forked from EngineX-Cambricon/enginex-mlu370-vllm
207 lines
6.3 KiB
Python
Executable File
207 lines
6.3 KiB
Python
Executable File
import re
|
|
|
|
# Per-model smooth-quantization layout configuration, keyed by model_type.
#
# Fields per entry:
#   qkv_list:      attention projection submodule names; a single name (e.g.
#                  "c_attn", "W_pack", "query_key_value") means the checkpoint
#                  packs Q/K/V into one layer, three names mean they are split.
#   gate_up_list:  MLP up-projection submodule names; two names when gate and
#                  up are separate layers, one when they are fused.
#   is_gate_up:    whether the MLP has a gate/up structure at all
#                  (False only for bloom below).
#   moe_list:      None for dense models; for MoE models, a nested dict with
#                  the expert-layer gate_up/down names and an "is_merged" flag
#                  (presumably: experts stored as one merged tensor — TODO
#                  confirm against the consumer of this config).
#   skip_patterns: optional list of regexes for layer names to exclude
#                  (e.g. the vision tower of qwen2_vl).
smooth_model_config = {
    "mllama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "llama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen2_vl": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None,
        # NOTE(review): `\.*` matches zero-or-more literal dots, so this is
        # effectively a "visual" prefix match; `^visual\.` may have been
        # intended — confirm before tightening.
        "skip_patterns": [r"^visual\.*"]
    },
    "qwen2": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen": {
        "qkv_list": ["c_attn"],
        "gate_up_list": ["w2", "w1"],
        "is_gate_up": True,
        "moe_list": None
    },
    "baichuan": {
        "qkv_list": ["W_pack"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "chatglm": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "gpt_neox": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": [],
        "is_gate_up": True,
        "moe_list": None
    },
    "mixtral": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["w1", "w3"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
            "down_list": ["block_sparse_moe.w2", "w2"],
            "is_merged": True
        }
    },
    "qwen2_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "deepseek_v2": {
        "qkv_list": ["q_proj", "q_b_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        },
        # kv_b_proj is excluded from smoothing (MLA attention layer).
        "skip_patterns": [r".*\.kv_b_proj\..*",]
    },
    "falcon": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "bloom": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": False,
        "moe_list": None
    },
    "internlm2": {
        "qkv_list": ["wqkv"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "hunyuan": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "phi3": {
        "qkv_list": ["qkv_proj"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
}
|
|
|
|
|
|
def get_layer_weight_bias_name(model_type, layer_name):
    """Resolve the checkpoint weight/bias parameter names for a vLLM layer.

    The default convention is ``{layer_name}.weight`` / ``{layer_name}.bias``;
    layers whose checkpoint names differ from the vLLM layer name are remapped
    here first.  Since vLLM 0.5.3 the default convention holds almost
    everywhere, so only a few overrides remain.

    Args:
        model_type: model family key (see ``smooth_model_config``).
        layer_name: dotted vLLM layer name.

    Returns:
        Tuple of ``(layer_name, weight_name, bias_name)`` after remapping.
    """
    # Layers needing a name override are listed here.  hunyuan's lm_head
    # shares the input embedding table, so its parameters live under
    # model.embed_tokens in the checkpoint.
    if model_type == "hunyuan" and "lm_head" in layer_name:
        layer_name = "model.embed_tokens"

    return layer_name, f"{layer_name}.weight", f"{layer_name}.bias"
|
|
|
|
|
|
def modify_layer_weight_bias_name(model_type, named_parameters):
    """Rename parameters in-place where the vLLM name differs from the HF name.

    Args:
        model_type: model family key.
        named_parameters: mapping of parameter name -> tensor; mutated in
            place (entries are popped and re-inserted under the HF name).
    """
    # Per-model vLLM-name -> HF-name overrides.
    renames = {
        "chatglm": {
            "transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
        },
    }

    for vllm_name, hf_name in renames.get(model_type, {}).items():
        if vllm_name in named_parameters:
            named_parameters[hf_name] = named_parameters.pop(vllm_name)
|
|
|
|
|
|
def extract_numbers(string):
    """Return the last run of digits in ``string`` as an int, or 0 if none.

    e.g. ``"chatglm2"`` -> 2, ``"v4.5"`` -> 5, ``"chatglm"`` -> 0.
    """
    # Find every contiguous digit sequence; only the final one matters.
    digit_runs = re.findall(r'\d+', string)
    return int(digit_runs[-1]) if digit_runs else 0
|
|
|
|
|
|
def get_qkv_distribution(model_type, model_version, hf_config):
    """Report whether a model's packed QKV weight uses the n3sh layout.

    Layouts for a packed QKV projection:
        n3sh: [head_num, 3, head_size, hidden_size]
        3nsh: [3, head_num, head_size, hidden_size]

    vLLM stores packed QKV as 3nsh; some Hugging Face checkpoints use n3sh,
    so the tool converts 3nsh -> n3sh for the models flagged here.  Only
    relevant for models with a single packed QKV layer.

    Returns:
        ``(is_n3sh, head_num, kv_head_num)``; head counts are 0 when the
        model is not n3sh.
    """
    # chatglm v1 (no digits / trailing 0 in its version string), bloom and
    # gpt_neox: n3sh with no grouped-query attention (kv heads == heads).
    if model_type in ("bloom", "gpt_neox") or (
            model_type == "chatglm" and extract_numbers(model_version) == 0):
        heads = hf_config.num_attention_heads
        return True, heads, heads

    if model_type == "falcon":
        heads = hf_config.num_attention_heads
        # Falcon variants: new decoder arch carries an explicit kv head
        # count, older multi-query models share one kv head, otherwise MHA.
        if hf_config.new_decoder_architecture:
            kv_heads = hf_config.num_kv_heads
        elif hf_config.multi_query:
            kv_heads = 1
        else:
            kv_heads = heads
        return True, heads, kv_heads

    return False, 0, 0
|