Files
2026-02-04 17:22:39 +08:00

207 lines
6.3 KiB
Python
Executable File

import re
# Per-model-type smoothing configuration.
# Schema for each entry:
#   qkv_list:      linear-layer name(s) holding query/key/value projections
#                  (a single name means the model packs qkv into one layer)
#   gate_up_list:  MLP gate/up projection layer name(s)
#   is_gate_up:    whether gate and up projections are fused/smoothed together
#   moe_list:      None for dense models; for MoE models, a dict with the
#                  expert gate_up/down layer names and an is_merged flag
#   skip_patterns: optional regex list; matching layer names are skipped
smooth_model_config = {
"mllama": {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": None
},
"llama": {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": None
},
"qwen2_vl": {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": None,
    # Skip the vision tower. Was r"^visual\.*" ("visual" + zero or more
    # dots), which also matched unrelated names such as "visualizer";
    # r"^visual\." matches exactly the "visual." submodule prefix.
    "skip_patterns": [r"^visual\."]
},
"qwen2": {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": None
},
"qwen": {
    "qkv_list": ["c_attn"],
    "gate_up_list": ["w2", "w1"],
    "is_gate_up": True,
    "moe_list": None
},
"baichuan": {
    "qkv_list": ["W_pack"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": None
},
"chatglm": {
    "qkv_list": ["query_key_value"],
    "gate_up_list": ["dense_h_to_4h"],
    "is_gate_up": True,
    "moe_list": None
},
"gpt_neox": {
    "qkv_list": ["query_key_value"],
    "gate_up_list": [],
    "is_gate_up": True,
    "moe_list": None
},
"mixtral": {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["w1", "w3"],
    "is_gate_up": True,
    "moe_list": {
        "gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
        "down_list": ["block_sparse_moe.w2", "w2"],
        "is_merged": True
    }
},
"qwen2_moe": {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": {
        "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
        "down_list": ["mlp.w2", "down_proj"],
        "is_merged": True
    }
},
"deepseek_v2": {
    "qkv_list": ["q_proj", "q_b_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": {
        "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
        "down_list": ["mlp.w2", "down_proj"],
        "is_merged": True
    },
    # kv_b_proj carries MLA latent weights and must not be smoothed.
    "skip_patterns": [r".*\.kv_b_proj\..*"]
},
"falcon": {
    "qkv_list": ["query_key_value"],
    "gate_up_list": ["dense_h_to_4h"],
    "is_gate_up": True,
    "moe_list": None
},
"bloom": {
    "qkv_list": ["query_key_value"],
    "gate_up_list": ["dense_h_to_4h"],
    "is_gate_up": False,
    "moe_list": None
},
"internlm2": {
    "qkv_list": ["wqkv"],
    "gate_up_list": ["gate_up_proj"],
    "is_gate_up": True,
    "moe_list": None
},
"hunyuan": {
    "qkv_list": ["q_proj", "k_proj", "v_proj"],
    "gate_up_list": ["gate_proj", "up_proj"],
    "is_gate_up": True,
    "moe_list": {
        "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
        "down_list": ["mlp.w2", "down_proj"],
        "is_merged": True
    }
},
"phi3": {
    "qkv_list": ["qkv_proj"],
    "gate_up_list": ["gate_up_proj"],
    "is_gate_up": True,
    "moe_list": None
},
}
def get_layer_weight_bias_name(model_type, layer_name):
    """
    Resolve the parameter names belonging to a layer.

    Since vllm 0.5.3, parameter names follow the convention
    ``{layer_name}.weight`` / ``{layer_name}.bias``, so the default path
    simply derives both names from ``layer_name``. Model-specific
    exceptions (layers whose parameters live under a different name)
    are handled explicitly before the default.

    Args:
        model_type: model family string, e.g. ``"hunyuan"``.
        layer_name: the vllm layer name.

    Returns:
        tuple: ``(layer_name, weight_name, bias_name)`` — layer_name may
        be remapped for special-cased models.
    """
    # Special cases go here: hunyuan ties lm_head to the embedding table.
    if model_type == "hunyuan" and "lm_head" in layer_name:
        remapped = "model.embed_tokens"
        return remapped, f"{remapped}.weight", f"{remapped}.bias"
    return layer_name, f"{layer_name}.weight", f"{layer_name}.bias"
def modify_layer_weight_bias_name(model_type, named_parameters):
    """
    Rename parameters in-place where the vllm layer name differs from the
    HuggingFace layer name.

    Args:
        model_type: model family string, e.g. ``"chatglm"``.
        named_parameters: dict of parameter name -> tensor; mutated in place.
    """
    # vllm-name -> hf-name rename rules, keyed by model type.
    rename_rules = {
        "chatglm": {
            "transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
        },
    }
    for vllm_name, hf_name in rename_rules.get(model_type, {}).items():
        if vllm_name in named_parameters:
            named_parameters[hf_name] = named_parameters.pop(vllm_name)
def extract_numbers(string):
    """
    Return the last run of digits found in *string* as an int.

    Returns 0 when the string contains no digits.
    """
    digit_runs = re.findall(r'\d+', string)
    if not digit_runs:
        return 0
    return int(digit_runs[-1])
def get_qkv_distribution(model_type, model_version, hf_config):
    """
    Get qkv distribution: n3sh or 3nsh.

    n3sh: [head_num, 3, head_size, hidden_size]
    3nsh: [3, head_num, head_size, hidden_size]

    vllm's default qkv distribution is 3nsh, so this reports which models
    need their packed qkv layer converted from 3nsh back to n3sh to match
    the hugging face layout. Only meaningful for packed-qkv models.

    Returns:
        tuple: ``(is_n3sh, head_num, kv_head_num)``; head counts are 0
        when no conversion is needed.
    """
    if model_type == "falcon":
        heads = hf_config.num_attention_heads
        if hf_config.new_decoder_architecture:
            kv_heads = hf_config.num_kv_heads
        elif hf_config.multi_query:
            kv_heads = 1
        else:
            kv_heads = heads
        return True, heads, kv_heads
    chatglm_v0 = model_type == "chatglm" and extract_numbers(model_version) == 0
    if chatglm_v0 or model_type in ("bloom", "gpt_neox"):
        heads = hf_config.num_attention_heads
        return True, heads, heads
    return False, 0, 0