xc-llm-ascend/vllm_ascend/quantization/modelslim_config.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
"""ModelSlim quantization configuration and model mappings for Ascend.

This module provides the AscendModelSlimConfig class for parsing quantization
configs generated by the ModelSlim tool, along with model-specific mappings.
"""

import glob
import json
import os
from collections.abc import Mapping
from types import MappingProxyType
from typing import Any, Optional

import regex as re
import torch
from vllm.config import get_current_vllm_config
from vllm.logger import logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization import register_quantization_config
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase
from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod, VocabParallelEmbedding
from vllm.model_executor.models.utils import WeightsMapper

from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, calc_split_factor

from .methods import get_scheme_class

# The config filename that ModelSlim generates after quantizing a model.
# AscendModelSlimConfig.maybe_update_config() looks this file up for the model.
MODELSLIM_CONFIG_FILENAME = "quant_model_description.json"

# Per-model table of fused ("packed") modules.
# key: model_type
# value: dict of fused module name -> list of original module names
#
# Lookups in the quantization description use the original module names: the
# fused name at the end of a layer prefix is substituted by each listed name and
# "<name>.weight" is read (see get_linear_quant_type and
# AscendModelSlimConfig.is_layer_skipped_ascend). For "experts", the listed
# names therefore point at the projections of expert 0.
packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
    "qwen3_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen3_5": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
    },
    "qwen3_5_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "deepseek_v2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v3": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "pangu_ultra_moe": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "kimi_k2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v32": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm_moe_dsa": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    # NOTE 1: The MTP layer of a quantized DeepSeek model is not quantized on the NPU.
    # NOTE 2: The description file generated by the current msmodelslim tool does not
    # contain MTP layer info. Please add it manually and set the value to FLOAT.
    "deepseek_mtp": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "pangu_ultra_moe_mtp": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "qwen3_next": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen2_5_vl": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    },
    "qwen3_vl_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4_moe_lite": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm4v_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4v_moe_text": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "longcat_flash": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "minimax_m2": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
    },
    "qwen3_omni_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "attn_qkv_proj": [
            "attn_q_proj",
            "attn_k_proj",
            "attn_v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen2_5_omni": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "attn_qkv_proj": [
            "attn_q_proj",
            "attn_k_proj",
            "attn_v_proj",
        ],
        "qkv": [
            "q",
            "k",
            "v",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    },
}


def get_packed_modules_mapping(model_type: str) -> dict[str, list[str]]:
    """Look up the fused-module layout registered for ``model_type``.

    Args:
        model_type: Model type identifier, e.g. ``"deepseek_v3"``.

    Returns:
        The entry of ``packed_modules_model_mapping`` for ``model_type``
        (fused module name -> names of the modules it packs), or a new
        empty dict when the model type has no entry.
    """
    try:
        return packed_modules_model_mapping[model_type]
    except KeyError:
        # Unknown model type: no fused modules are registered for it.
        return {}


def get_linear_quant_type(
    quant_description: dict[str, Any], prefix: str, packed_modules_mapping: dict[str, Any]
) -> str | None:
    """Determine the quantization type for a linear layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    proj_name = prefix.split(".")[-1]
    if proj_name in packed_modules_mapping:
        quant_type = None
        shard_prefixes = [
            prefix.replace(proj_name, shard_proj_name) for shard_proj_name in packed_modules_mapping[proj_name]
        ]
        for shard_prefix in shard_prefixes:
            shard_quant_type = quant_description[shard_prefix + ".weight"]

            if quant_type is None:
                quant_type = shard_quant_type
            elif shard_quant_type != quant_type:
                raise ValueError(
                    f"Not all shards of {prefix} are quantized with same quant type."
                    f"Shard {proj_name} uses {shard_quant_type}, but another shard"
                    f"use {quant_type}. Please check quantization config."
                )
    else:
        quant_type = quant_description[prefix + ".weight"]
    return quant_type


def get_quant_type_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
) -> str | None:
    """Determine the quantization type for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    if packed_modules_mapping is None:
        packed_modules_mapping = dict()
    # Attention
    if layer_type == "attention" and "fa_quant_type" in quant_description:
        return quant_description["fa_quant_type"]
    if layer_type == "attention" and "indexer_quant_type" in quant_description:
        return quant_description["indexer_quant_type"]
    # Linear / MoE
    return get_linear_quant_type(quant_description, prefix, packed_modules_mapping)


def create_scheme_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
):
    """Instantiate the quantization scheme that matches a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        A new instance of the scheme class registered for the layer's
        quantization type and ``layer_type``.

    Raises:
        ValueError: If no quantization type could be determined.
        NotImplementedError: If no scheme class is registered for the
            resolved type / layer type combination.
    """
    logger.info_once("Using the vLLM Ascend modelslim Quantization now!")

    resolved_type = get_quant_type_for_layer(quant_description, prefix, layer_type, packed_modules_mapping)
    if resolved_type is None:
        raise ValueError(f"Could not determine quantization type for layer {prefix}.")

    # The registry returns None when nothing is registered for this pair.
    scheme_factory = get_scheme_class(resolved_type, layer_type)
    if scheme_factory is None:
        raise NotImplementedError(f"Currently, vLLM Ascend doesn't support {resolved_type} for {layer_type}.")
    return scheme_factory()


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
class AscendModelSlimConfig(QuantizationConfig):
    """Config class for Ascend ModelSlim quantization.

    This class is a general class that parses quantization configs
    that are supported on Ascend hardware, specifically for models
    quantized using the ModelSlim tool.
    """

    def __init__(self, quant_config: dict[str, Any] | None = None):
        """Create the config from an optional ModelSlim description dict.

        Args:
            quant_config: Parsed quantization description. It is stored by
                reference (not copied) and extended in place with alias keys;
                ``None`` results in an empty description.
        """
        super().__init__()
        # Bookkeeping that is filled in later by quant_prefix_mapper() and
        # apply_vllm_mapper().
        self.model_type: str | None = None
        self.hf_to_vllm_mapper: WeightsMapper | None = None
        self._mapper_applied = False

        self.quant_description = {} if quant_config is None else quant_config
        # Add alias keys first, then derive the KV-cache / indexer metadata
        # from the (possibly extended) description.
        self._apply_extra_quant_adaptations()
        self._add_kvcache_quant_metadata()

    def __repr__(self) -> str:
        return "AscendModelSlimConfig:\n" + super().__repr__()

    @classmethod
    def get_name(cls) -> str:
        """Return the quantization method name this config class is registered under."""
        method_name = ASCEND_QUANTIZATION_METHOD
        return method_name

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        raise NotImplementedError('Ascend hardware dose not support "get_min_capability" feature.')

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        # Return empty list so that vllm's get_quant_config() skips the
        # file-based lookup (which raises an unfriendly "Cannot find the
        # config file for ascend" error when the model is not quantized).
        # Instead, the config file is loaded in maybe_update_config(),
        # which can provide a user-friendly error message.
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
        """Build a config instance from an already-parsed quantization description dict."""
        instance = cls(config)
        return instance

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
        if hf_quant_cfg is not None:
            quant_method = hf_quant_cfg.get("quant_method", None)
            if not quant_method and torch.npu.is_available():
                return ASCEND_QUANTIZATION_METHOD
        return None

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        """Apply the vLLM model-specific mapper to this quantization config.

        This method is called by vLLM to apply the model-specific weight mapper
        to the quantization configuration. It directly uses the forward mapping
        (HF -> vLLM) to transform keys in quant_description from HF format to
        vLLM format.

        Args:
            hf_to_vllm_mapper: The WeightsMapper instance provided by vLLM
                that contains model-specific prefix mappings (HF to vLLM).
        """
        if self._mapper_applied and self.hf_to_vllm_mapper is hf_to_vllm_mapper:
            return

        self.hf_to_vllm_mapper = hf_to_vllm_mapper
        self._mapper_applied = True

        if self.quant_description:
            self.quant_description = hf_to_vllm_mapper.apply_dict(self.quant_description)
            self._add_kvcache_quant_metadata()
            logger.info("Applied hf_to_vllm_mapper to quant_description keys")

    def quant_prefix_mapper(self, model_type: str, prefix: str) -> str:
        self.model_type = model_type
        return prefix

    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
        """Select the quantize method object for ``layer``.

        The model type is read from the current vLLM config. For known model
        types ``self.packed_modules_mapping`` is switched to that model's
        fused-module table before the layer is inspected.

        Args:
            layer: The module a quantize method is requested for.
            prefix: The layer prefix (dotted module path).

        Returns:
            An unquantized method for linear / MoE / embedding layers that the
            description marks as FLOAT, an Ascend method wrapping the matching
            scheme for quantized linear, attention (FA / indexer quant
            layers only), MoE and embedding layers, or ``None`` for any other
            layer.
        """
        from .method_adapters import (
            AscendEmbeddingMethod,
            AscendFusedMoEMethod,
            AscendKVCacheMethod,
            AscendLinearMethod,
        )

        model_type = get_current_vllm_config().model_config.hf_config.model_type

        if model_type in ("minimax", "minimax_m2"):
            # Adapt to Minimax architecture: update layer names to MoE convention
            prefix = prefix.replace("mlp", "block_sparse_moe")
            # Normalize the prefix by cutting it after 'experts' when an expert
            # index follows (e.g., '...experts.0' -> '...experts').
            segments = prefix.split(".")
            if len(segments) > 2 and "experts" in segments:
                index_pos = segments.index("experts") + 1
                if index_pos < len(segments) and segments[index_pos].isdigit():
                    prefix = ".".join(segments[:index_pos])

        model_packed_mapping = packed_modules_model_mapping.get(model_type)
        if model_packed_mapping is not None:
            self.packed_modules_mapping = model_packed_mapping
        prefix = self.quant_prefix_mapper(model_type, prefix)

        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod

                return AscendUnquantizedLinearMethod()
            linear_scheme = create_scheme_for_layer(
                self.quant_description, prefix, "linear", self.packed_modules_mapping
            )
            return AscendLinearMethod(linear_scheme)

        if isinstance(layer, AttentionLayerBase) and (
            self.is_fa_quant_layer(prefix) or self.is_indexer_quant_layer(prefix)
        ):
            kv_scheme = create_scheme_for_layer(
                self.quant_description, prefix, "attention", self.packed_modules_mapping
            )
            return AscendKVCacheMethod(kv_scheme)

        if isinstance(layer, FusedMoE):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod

                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
            moe_scheme = create_scheme_for_layer(self.quant_description, prefix, "moe", self.packed_modules_mapping)
            return AscendFusedMoEMethod(moe_scheme, layer.moe_config)

        if isinstance(layer, VocabParallelEmbedding):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                return UnquantizedEmbeddingMethod()
            # Embeddings reuse the "linear" scheme lookup.
            embedding_scheme = create_scheme_for_layer(
                self.quant_description, prefix, "linear", self.packed_modules_mapping
            )
            return AscendEmbeddingMethod(embedding_scheme)

        return None

    def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[str]] = MappingProxyType({})):
        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name in fused_mapping:
            shard_prefixes = [
                prefix.replace(proj_name, shard_proj_name) for shard_proj_name in fused_mapping[proj_name]
            ]

            is_skipped = None
            for shard_prefix in shard_prefixes:
                is_shard_skipped = self.quant_description[shard_prefix + ".weight"] == "FLOAT"

                if is_skipped is None:
                    is_skipped = is_shard_skipped
                elif is_shard_skipped != is_skipped:
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
                        "to have the same precision."
                    )
        else:
            is_skipped = any(
                key.startswith(prefix) and key.endswith(".weight") and value == "FLOAT"
                for key, value in self.quant_description.items()
            )

        assert is_skipped is not None
        return is_skipped

    def is_fa_quant_layer(self, prefix):
        if self.enable_fa_quant:
            layer_id_str = "".join(re.findall(r"\.(\d+)\.", prefix))
            if layer_id_str.isdigit() and int(layer_id_str) in self.kvcache_quant_layers:
                return True
        return False

    def is_indexer_quant_layer(self, prefix):
        if self.enable_indexer_quant:
            layer_id_str = "".join(re.findall(r"\.(\d+)\.", prefix))
            if layer_id_str.isdigit() and int(layer_id_str) in self.indexer_quant_layers:
                return True
        return False

    def enabling_fa_quant(self, vllm_config, layer_name) -> bool:
        is_decode_instance = (
            vllm_config.kv_transfer_config is not None
            and vllm_config.kv_transfer_config.is_kv_consumer
            and not vllm_config.kv_transfer_config.is_kv_producer
        )
        return bool(is_decode_instance and self.is_fa_quant_layer(layer_name))

    def get_kv_quant_dtype(self, layer_name, cache_dtype, model_config):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            ori_dtype = model_config.dtype
            quant_dtype = torch.int8
            # For MLA models like deepseek, we only quantify K cache to ensure accuracy
            if model_config.use_mla:
                return quant_dtype, ori_dtype
            else:
                return quant_dtype, quant_dtype
        return cache_dtype, cache_dtype

    def get_kv_quant_split_factor(self, layer_name, kv_head_dim_list):
        """Compute the split factor for a layer's K / V head dims.

        For FA quant layers the second entry (V head dim) is doubled before
        the list is handed to ``calc_split_factor``; the caller's list is not
        modified.
        """
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            kv_head_dim_list = [kv_head_dim_list[0], kv_head_dim_list[1] * 2]
        return calc_split_factor(kv_head_dim_list)

    def maybe_update_config(self, model_name: str, revision: str | None = None) -> None:
        """Load the ModelSlim quantization config from model directory.

        This method is called by vllm after get_quant_config() returns
        successfully. Since we return an empty list from get_config_filenames()
        to bypass vllm's built-in file lookup, we do the actual config loading
        here and provide user-friendly error messages when the config is missing.

        Works with both local directories (``/path/to/model``) and remote
        repository identifiers (``org/model-name``).  For remote repos the
        lookup goes through the HuggingFace / ModelScope cache via
        ``get_model_file`` to fetch the config if not already cached.

        Args:
            model_name: Path to the model directory or HuggingFace /
                ModelScope repo id.
            revision: Optional revision (branch, tag, or commit hash) for
                remote repos.
        """
        from vllm_ascend.quantization.utils import get_model_file

        # If quant_description is already populated (e.g. from from_config()),
        # there is nothing to do.
        if self.quant_description:
            return

        # Try to get the config file (local or remote)
        config_path = get_model_file(model_name, MODELSLIM_CONFIG_FILENAME, revision=revision)

        if config_path is not None:
            with open(config_path) as f:
                self.quant_description = json.load(f)
            self._apply_extra_quant_adaptations()
            self._add_kvcache_quant_metadata()
            return

        # Collect diagnostic info for the error message
        json_names: list[str] = []
        if os.path.isdir(model_name):
            json_files = glob.glob(os.path.join(model_name, "*.json"))
            json_names = [os.path.basename(f) for f in json_files]

        # Config file not found - raise a friendly error message
        raise ValueError(
            "\n"
            + "=" * 80
            + "\n"
            + "ERROR: ModelSlim Quantization Config Not Found\n"
            + "=" * 80
            + "\n"
            + "\n"
            + f"You have enabled '--quantization {ASCEND_QUANTIZATION_METHOD}' "
            + "(ModelSlim quantization),\n"
            + f"but the model '{model_name}' does not contain the required\n"
            + f"quantization config file ('{MODELSLIM_CONFIG_FILENAME}').\n"
            + "\n"
            + "This usually means the model weights are NOT quantized by "
            + "ModelSlim.\n"
            + "\n"
            + "Please choose one of the following solutions:\n"
            + "\n"
            + "  Solution 1: Remove the quantization option "
            + "(for float/unquantized models)\n"
            + "  "
            + "-" * 58
            + "\n"
            + f"    Remove '--quantization {ASCEND_QUANTIZATION_METHOD}' from "
            + "your command if you want to\n"
            + "    run the model with the original (float) weights.\n"
            + "\n"
            + "    Example:\n"
            + f"      vllm serve {model_name}\n"
            + "\n"
            + "  Solution 2: Quantize your model weights with ModelSlim first\n"
            + "  "
            + "-" * 58
            + "\n"
            + "    Use the ModelSlim tool to quantize your model weights "
            + "before deployment.\n"
            + "    After quantization, the model directory should contain "
            + f"'{MODELSLIM_CONFIG_FILENAME}'.\n"
            + "    For more information, please refer to:\n"
            + "    https://gitee.com/ascend/msit/tree/master/msmodelslim\n"
            + "\n"
            + (f"  (Found JSON files in model directory: {json_names})\n" if json_names else "")
            + "=" * 80
        )

    def _apply_extra_quant_adaptations(self) -> None:
        """Apply extra adaptations to the quant_description dict.

        This handles known key transformations such as shared_head and
        weight_packed mappings.
        """
        extra_quant_dict = {}
        for k in self.quant_description:
            if "shared_head" in k:
                new_k = k.replace(".shared_head.", ".")
                extra_quant_dict[new_k] = self.quant_description[k]
            if "weight_packed" in k:
                new_k = k.replace("weight_packed", "weight")
                extra_quant_dict[new_k] = self.quant_description[k]
        self.quant_description.update(extra_quant_dict)

    def _add_kvcache_quant_metadata(self):
        fa_quant_type = self.quant_description.get("fa_quant_type", "")
        self.enable_fa_quant = fa_quant_type != ""
        self.kvcache_quant_layers = []
        indexer_quant_type = self.quant_description.get("indexer_quant_type", "")
        self.enable_indexer_quant = indexer_quant_type != ""
        self.indexer_quant_layers = []
        if self.enable_fa_quant or self.enable_indexer_quant:
            for key in self.quant_description:
                _id = "".join(re.findall(r"\.(\d+)\.", key))
                if "fa_k.scale" in key:
                    self.kvcache_quant_layers.append(int(_id))
                if "indexer.quant_type" in key:
                    self.indexer_quant_layers.append(int(_id))