xc-llm-ascend/vllm_ascend/quantization/modelslim_config.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
"""ModelSlim quantization configuration and model mappings for Ascend.

This module provides the AscendModelSlimConfig class for parsing quantization
configs generated by the ModelSlim tool, along with model-specific mappings.
"""

import glob
import json
import os
from collections.abc import Mapping
from types import MappingProxyType
from typing import Any, Optional

import regex as re
import torch
from vllm.config import get_current_vllm_config
from vllm.logger import logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization import register_quantization_config
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase
from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod, VocabParallelEmbedding
from vllm.model_executor.models.utils import WeightsMapper

from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, calc_split_factor

from .methods import get_scheme_class

# The config filename that ModelSlim generates after quantizing a model.
MODELSLIM_CONFIG_FILENAME = "quant_model_description.json"

# key: model_type
# value: vLLM prefix -> HF prefix mapping (used to convert vLLM layer names to HF format
# for looking up keys in quant_model_description.json)
QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
    "qwen3_vl_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "qwen3_vl": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "kimi_k25": {
        "mm_projector.linear_1": "mm_projector.proj.0",
        "mm_projector.linear_2": "mm_projector.proj.2",
    },
    "qwen3_omni_moe": {
        "language_model.lm_head.": "thinker.lm_head.",
        "language_model.model.": "thinker.model.",
        "visual.": "thinker.visual.",
    },
    "qwen2_5_omni": {
        "language_model.lm_head.": "thinker.lm_head.",
        "language_model.model.": "thinker.model.",
        "visual.": "thinker.visual.",
    },
    "qwen2_5_omni_text": {
        "language_model.": "thinker.",
        "language_model.lm_head.": "thinker.lm_head.",
        "language_model.model.": "thinker.model.",
    },
    "glm4v_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "glm4v_moe_text": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "kimi_k2": {
        "language_model.layers.": "language_model.model.layers.",
        # mm projector
        "mm_projector.proj.0": "mm_projector.linear_1",
        "mm_projector.proj.2": "mm_projector.linear_2",
    },
}

# key: model_type
# value: dict of fused module name -> list of original module names
packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
    "qwen3_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen3_5": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
    },
    "qwen3_5_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "deepseek_v2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v3": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "pangu_ultra_moe": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "kimi_k2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v32": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm_moe_dsa": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
    # NOTE 2.The description file generated by the current msmodelslim tool does not have
    # MTP layer info. Please manually add it and set the value to FLOAT.
    "deepseek_mtp": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "pangu_ultra_moe_mtp": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "qwen3_next": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen2_5_vl": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    },
    "qwen3_vl_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4_moe_lite": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm4v_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4v_moe_text": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "longcat_flash": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "minimax_m2": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
    },
    "qwen3_omni_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "attn_qkv_proj": [
            "attn_q_proj",
            "attn_k_proj",
            "attn_v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen2_5_omni": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "attn_qkv_proj": [
            "attn_q_proj",
            "attn_k_proj",
            "attn_v_proj",
        ],
        "qkv": [
            "q",
            "k",
            "v",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    },
}


def get_packed_modules_mapping(model_type: str) -> dict[str, list[str]]:
    """Get packed modules mapping for a model type.

    Args:
        model_type: The model type string (e.g., "deepseek_v3").

    Returns:
        Dictionary mapping fused module names to their component module names.
        Returns empty dict if model_type is not found.
    """
    return packed_modules_model_mapping.get(model_type, {})


def get_prefix_mapping(model_type: str) -> dict[str, str]:
    """Get prefix mapping for a model type.

    Args:
        model_type: The model type string (e.g., "qwen3_vl_moe").

    Returns:
        Dictionary mapping original prefixes to new prefixes.
        Returns empty dict if model_type is not found.
    """
    return QUANT_MODEL_PREFIX_MAPPINGS.get(model_type, {})


def get_linear_quant_type(
    quant_description: dict[str, Any], prefix: str, packed_modules_mapping: dict[str, Any]
) -> str | None:
    """Determine the quantization type for a linear layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    proj_name = prefix.split(".")[-1]
    if proj_name in packed_modules_mapping:
        quant_type = None
        shard_prefixes = [
            prefix.replace(proj_name, shard_proj_name) for shard_proj_name in packed_modules_mapping[proj_name]
        ]
        for shard_prefix in shard_prefixes:
            shard_quant_type = quant_description[shard_prefix + ".weight"]

            if quant_type is None:
                quant_type = shard_quant_type
            elif shard_quant_type != quant_type:
                raise ValueError(
                    f"Not all shards of {prefix} are quantized with same quant type."
                    f"Shard {proj_name} uses {shard_quant_type}, but another shard"
                    f"use {quant_type}. Please check quantization config."
                )
    else:
        quant_type = quant_description[prefix + ".weight"]
    return quant_type


def get_quant_type_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
) -> str | None:
    """Determine the quantization type for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    if packed_modules_mapping is None:
        packed_modules_mapping = dict()
    # Attention
    if layer_type == "attention" and "fa_quant_type" in quant_description:
        return quant_description["fa_quant_type"]
    # Linear / MoE
    return get_linear_quant_type(quant_description, prefix, packed_modules_mapping)


def create_scheme_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
):
    """Create a quantization scheme instance for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        An instance of the appropriate quantization scheme class.
    """
    logger.info_once("Using the vLLM Ascend modelslim Quantization now!")
    quant_type = get_quant_type_for_layer(quant_description, prefix, layer_type, packed_modules_mapping)

    if quant_type is None:
        raise ValueError(f"Could not determine quantization type for layer {prefix}.")

    # Use registry to get scheme class
    scheme_cls = get_scheme_class(quant_type, layer_type)
    if scheme_cls is not None:
        return scheme_cls()

    raise NotImplementedError(f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}.")


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
class AscendModelSlimConfig(QuantizationConfig):
    """Config class for Ascend ModelSlim quantization.

    This class is a general class that parses quantization configs
    that are supported on Ascend hardware, specifically for models
    quantized using the ModelSlim tool.
    """

    def __init__(self, quant_config: dict[str, Any] | None = None):
        super().__init__()
        self.quant_description = quant_config if quant_config is not None else {}
        # TODO(whx): remove this adaptation after adding "shared_head"
        # to prefix of DeepSeekShareHead in vLLM.
        extra_quant_dict = {}
        for k in self.quant_description:
            if "shared_head" in k:
                new_k = k.replace(".shared_head.", ".")
                extra_quant_dict[new_k] = self.quant_description[k]
            if "weight_packed" in k:
                new_k = k.replace("weight_packed", "weight")
                extra_quant_dict[new_k] = self.quant_description[k]
        self.quant_description.update(extra_quant_dict)
        # Initialize attributes for type checking
        self.model_type: str | None = None
        self.hf_to_vllm_mapper: WeightsMapper | None = None
        self.vllm_to_hf_mapper: WeightsMapper | None = None
        self._add_kvcache_quant_metadata()

    def __repr__(self) -> str:
        return "AscendModelSlimConfig:\n" + super().__repr__()

    @classmethod
    def get_name(cls) -> str:
        return ASCEND_QUANTIZATION_METHOD

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        raise NotImplementedError('Ascend hardware dose not support "get_min_capability" feature.')

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        # Return empty list so that vllm's get_quant_config() skips the
        # file-based lookup (which raises an unfriendly "Cannot find the
        # config file for ascend" error when the model is not quantized).
        # Instead, the config file is loaded in maybe_update_config(),
        # which can provide a user-friendly error message.
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
        return cls(config)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
        if hf_quant_cfg is not None:
            quant_method = hf_quant_cfg.get("quant_method", None)
            if not quant_method and torch.npu.is_available():
                return ASCEND_QUANTIZATION_METHOD
        return None

    # TODO: Modify the key values in self.quant_description instead of flipping the hf_to_vllm_mapper
    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        """Apply the vLLM model-specific mapper to this quantization config.

        This method is called by vLLM to apply the model-specific weight mapper
        to the quantization configuration. It creates a reverse mapper to convert
        vLLM prefixes back to HF format for looking up keys in quant_config.json.

        Args:
            hf_to_vllm_mapper: The WeightsMapper instance provided by vLLM
                that contains model-specific prefix mappings (HF to vLLM).
        """
        # Check if we already have a valid vllm_to_hf_mapper for this hf_to_vllm_mapper
        if hasattr(self, "hf_to_vllm_mapper") and self.hf_to_vllm_mapper is hf_to_vllm_mapper:
            # Same mapper instance, no need to recreate
            return

        # Store the original mapper
        self.hf_to_vllm_mapper = hf_to_vllm_mapper

        # Try different ways to get the mapping based on WeightsMapper implementation
        mapping_attrs = ["orig_to_new_prefix"]
        orig_to_new_prefix = {}

        for attr_name in mapping_attrs:
            if hasattr(hf_to_vllm_mapper, attr_name):
                orig_to_new_prefix = getattr(hf_to_vllm_mapper, attr_name)
                break

        # Create reverse mapping (vLLM -> HF), skipping empty values
        vllm_to_hf_mapping = {}
        for orig_prefix, new_prefix in orig_to_new_prefix.items():
            # Skip empty values to avoid invalid keys in reverse mapping
            if new_prefix:
                vllm_to_hf_mapping[new_prefix] = orig_prefix

        # Create and store the reverse WeightsMapper instance
        if vllm_to_hf_mapping:
            self.vllm_to_hf_mapper = WeightsMapper(orig_to_new_prefix=vllm_to_hf_mapping)
            logger.debug(f"Created reverse mapping from hf_to_vllm_mapper: {vllm_to_hf_mapping}")
        else:
            logger.info("No valid reverse mapping found for WeightsMapper.")

    def quant_prefix_mapper(self, model_type: str, prefix: str) -> str:
        # Store model_type for reference
        self.model_type = model_type

        # Check if manual mapping exists for this model type
        # Manual mapping takes priority and is used exclusively to avoid conflicts
        if model_type in QUANT_MODEL_PREFIX_MAPPINGS:
            manual_mapping = QUANT_MODEL_PREFIX_MAPPINGS[model_type]
            # Manual mapping is already in vLLM -> HF direction, use directly
            mapper = WeightsMapper(orig_to_new_prefix=manual_mapping)
            return mapper._map_name(prefix)

        # Use the reverse mapper (vLLM to HF) if available
        if hasattr(self, "vllm_to_hf_mapper") and self.vllm_to_hf_mapper:
            return self.vllm_to_hf_mapper._map_name(prefix)

        # Fall back to manual mapping for backward compatibility (simplified)
        # This is only used if apply_vllm_mapper wasn't called or failed
        prefix_mapping = QUANT_MODEL_PREFIX_MAPPINGS.get(model_type)
        if prefix_mapping:
            # Manual mapping is already in vLLM -> HF direction, use directly
            mapper = WeightsMapper(orig_to_new_prefix=prefix_mapping)
            return mapper._map_name(prefix)

        return prefix

    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
        from .method_adapters import (
            AscendEmbeddingMethod,
            AscendFusedMoEMethod,
            AscendKVCacheMethod,
            AscendLinearMethod,
        )

        vllm_config = get_current_vllm_config()
        model_type = vllm_config.model_config.hf_config.model_type

        if model_type in ["minimax", "minimax_m2"]:
            # Adapt to Minimax architecture: update layer names to MoE convention
            prefix = prefix.replace("mlp", "block_sparse_moe")
            # Normalize the prefix by stripping specific expert indices (e.g., 'experts.0' -> 'experts')
            parts = prefix.split(".")
            if "experts" in parts and len(parts) > 2:
                exp_idx = parts.index("experts")
                if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
                    parts = parts[: exp_idx + 1]
                    prefix = ".".join(parts)

        if model_type in packed_modules_model_mapping:
            self.packed_modules_mapping = packed_modules_model_mapping[model_type]
        prefix = self.quant_prefix_mapper(model_type, prefix)

        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod

                return AscendUnquantizedLinearMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendLinearMethod(scheme)
        elif isinstance(layer, AttentionLayerBase) and self.is_fa_quant_layer(prefix):
            scheme = create_scheme_for_layer(self.quant_description, prefix, "attention", self.packed_modules_mapping)
            return AscendKVCacheMethod(scheme)
        elif isinstance(layer, FusedMoE):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod

                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
            scheme = create_scheme_for_layer(self.quant_description, prefix, "moe", self.packed_modules_mapping)
            return AscendFusedMoEMethod(scheme, layer.moe_config)
        elif isinstance(layer, VocabParallelEmbedding):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                return UnquantizedEmbeddingMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendEmbeddingMethod(scheme)
        return None

    def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[str]] = MappingProxyType({})):
        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name in fused_mapping:
            shard_prefixes = [
                prefix.replace(proj_name, shard_proj_name) for shard_proj_name in fused_mapping[proj_name]
            ]

            is_skipped = None
            for shard_prefix in shard_prefixes:
                is_shard_skipped = self.quant_description[shard_prefix + ".weight"] == "FLOAT"

                if is_skipped is None:
                    is_skipped = is_shard_skipped
                elif is_shard_skipped != is_skipped:
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
                        "to have the same precision."
                    )
        else:
            is_skipped = any(
                key.startswith(prefix) and key.endswith(".weight") and value == "FLOAT"
                for key, value in self.quant_description.items()
            )

        assert is_skipped is not None
        return is_skipped

    def is_fa_quant_layer(self, prefix):
        if self.enable_fa_quant:
            layer_id_str = "".join(re.findall(r"\.(\d+)\.", prefix))
            if layer_id_str.isdigit() and int(layer_id_str) in self.kvcache_quant_layers:
                return True
        return False

    def enabling_fa_quant(self, vllm_config, layer_name) -> bool:
        is_decode_instance = (
            vllm_config.kv_transfer_config is not None
            and vllm_config.kv_transfer_config.is_kv_consumer
            and not vllm_config.kv_transfer_config.is_kv_producer
        )
        return bool(is_decode_instance and self.is_fa_quant_layer(layer_name))

    def get_kv_quant_dtype(self, layer_name, cache_dtype, model_config):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            ori_dtype = model_config.dtype
            quant_dtype = torch.int8
            # For MLA models like deepseek, we only quantify K cache to ensure accuracy
            if model_config.use_mla:
                return quant_dtype, ori_dtype
            else:
                return quant_dtype, quant_dtype
        return cache_dtype, cache_dtype

    def get_kv_quant_split_factor(self, layer_name, kv_head_dim_list):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            k_quant_head_dim = kv_head_dim_list[0]
            v_quant_head_dim = kv_head_dim_list[1] * 2
            kv_head_dim_list = [k_quant_head_dim, v_quant_head_dim]
        return calc_split_factor(kv_head_dim_list)

    def maybe_update_config(self, model_name: str, revision: str | None = None) -> None:
        """Load the ModelSlim quantization config from model directory.

        This method is called by vllm after get_quant_config() returns
        successfully. Since we return an empty list from get_config_filenames()
        to bypass vllm's built-in file lookup, we do the actual config loading
        here and provide user-friendly error messages when the config is missing.

        Works with both local directories (``/path/to/model``) and remote
        repository identifiers (``org/model-name``).  For remote repos the
        lookup goes through the HuggingFace / ModelScope cache via
        ``get_model_file`` to fetch the config if not already cached.

        Args:
            model_name: Path to the model directory or HuggingFace /
                ModelScope repo id.
            revision: Optional revision (branch, tag, or commit hash) for
                remote repos.
        """
        from vllm_ascend.quantization.utils import get_model_file

        # If quant_description is already populated (e.g. from from_config()),
        # there is nothing to do.
        if self.quant_description:
            return

        # Try to get the config file (local or remote)
        config_path = get_model_file(model_name, MODELSLIM_CONFIG_FILENAME, revision=revision)

        if config_path is not None:
            with open(config_path) as f:
                self.quant_description = json.load(f)
            self._apply_extra_quant_adaptations()
            self._add_kvcache_quant_metadata()
            return

        # Collect diagnostic info for the error message
        json_names: list[str] = []
        if os.path.isdir(model_name):
            json_files = glob.glob(os.path.join(model_name, "*.json"))
            json_names = [os.path.basename(f) for f in json_files]

        # Config file not found - raise a friendly error message
        raise ValueError(
            "\n"
            + "=" * 80
            + "\n"
            + "ERROR: ModelSlim Quantization Config Not Found\n"
            + "=" * 80
            + "\n"
            + "\n"
            + f"You have enabled '--quantization {ASCEND_QUANTIZATION_METHOD}' "
            + "(ModelSlim quantization),\n"
            + f"but the model '{model_name}' does not contain the required\n"
            + f"quantization config file ('{MODELSLIM_CONFIG_FILENAME}').\n"
            + "\n"
            + "This usually means the model weights are NOT quantized by "
            + "ModelSlim.\n"
            + "\n"
            + "Please choose one of the following solutions:\n"
            + "\n"
            + "  Solution 1: Remove the quantization option "
            + "(for float/unquantized models)\n"
            + "  "
            + "-" * 58
            + "\n"
            + f"    Remove '--quantization {ASCEND_QUANTIZATION_METHOD}' from "
            + "your command if you want to\n"
            + "    run the model with the original (float) weights.\n"
            + "\n"
            + "    Example:\n"
            + f"      vllm serve {model_name}\n"
            + "\n"
            + "  Solution 2: Quantize your model weights with ModelSlim first\n"
            + "  "
            + "-" * 58
            + "\n"
            + "    Use the ModelSlim tool to quantize your model weights "
            + "before deployment.\n"
            + "    After quantization, the model directory should contain "
            + f"'{MODELSLIM_CONFIG_FILENAME}'.\n"
            + "    For more information, please refer to:\n"
            + "    https://gitee.com/ascend/msit/tree/master/msmodelslim\n"
            + "\n"
            + (f"  (Found JSON files in model directory: {json_names})\n" if json_names else "")
            + "=" * 80
        )

    def _apply_extra_quant_adaptations(self) -> None:
        """Apply extra adaptations to the quant_description dict.

        This handles known key transformations such as shared_head and
        weight_packed mappings.
        """
        extra_quant_dict = {}
        for k in self.quant_description:
            if "shared_head" in k:
                new_k = k.replace(".shared_head.", ".")
                extra_quant_dict[new_k] = self.quant_description[k]
            if "weight_packed" in k:
                new_k = k.replace("weight_packed", "weight")
                extra_quant_dict[new_k] = self.quant_description[k]
        self.quant_description.update(extra_quant_dict)

    def get_scaled_act_names(self) -> list[str]:
        return []

    def _add_kvcache_quant_metadata(self):
        fa_quant_type = self.quant_description.get("fa_quant_type", "")
        self.enable_fa_quant = fa_quant_type != ""
        self.kvcache_quant_layers = []
        if self.enable_fa_quant:
            for key in self.quant_description:
                if "fa_k.scale" in key:
                    _id = "".join(re.findall(r"\.(\d+)\.", key))
                    self.kvcache_quant_layers.append(int(_id))