forked from EngineX-Cambricon/enginex-mlu370-vllm
207 lines
6.3 KiB
Python
Executable File
207 lines
6.3 KiB
Python
Executable File
import re
|
|
|
|
# Per-model smooth-quantization layout configuration, keyed by model_type.
#
# Fields per entry:
#   qkv_list:      attention projection submodule names; a single name (e.g.
#                  "c_attn", "W_pack", "query_key_value") means the checkpoint
#                  packs Q/K/V into one layer, three names mean they are split.
#   gate_up_list:  MLP up-projection submodule names; two names when gate and
#                  up are separate layers, one when they are fused.
#   is_gate_up:    whether the MLP has a gate/up structure at all
#                  (False only for bloom below).
#   moe_list:      None for dense models; for MoE models, a nested dict with
#                  the expert-layer gate_up/down names and an "is_merged" flag
#                  (presumably: experts stored as one merged tensor — TODO
#                  confirm against the consumer of this config).
#   skip_patterns: optional list of regexes for layer names to exclude
#                  (e.g. the vision tower of qwen2_vl).
smooth_model_config = {
    "mllama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "llama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen2_vl": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None,
        # NOTE(review): `\.*` matches zero-or-more literal dots, so this is
        # effectively a "visual" prefix match; `^visual\.` may have been
        # intended — confirm before tightening.
        "skip_patterns": [r"^visual\.*"]
    },
    "qwen2": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen": {
        "qkv_list": ["c_attn"],
        "gate_up_list": ["w2", "w1"],
        "is_gate_up": True,
        "moe_list": None
    },
    "baichuan": {
        "qkv_list": ["W_pack"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "chatglm": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "gpt_neox": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": [],
        "is_gate_up": True,
        "moe_list": None
    },
    "mixtral": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["w1", "w3"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
            "down_list": ["block_sparse_moe.w2", "w2"],
            "is_merged": True
        }
    },
    "qwen2_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "deepseek_v2": {
        "qkv_list": ["q_proj", "q_b_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        },
        # kv_b_proj is excluded from smoothing (MLA attention layer).
        "skip_patterns": [r".*\.kv_b_proj\..*",]
    },
    "falcon": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "bloom": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": False,
        "moe_list": None
    },
    "internlm2": {
        "qkv_list": ["wqkv"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "hunyuan": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "phi3": {
        "qkv_list": ["qkv_proj"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
}
|
|
|
|
|
|
def get_layer_weight_bias_name(model_type, layer_name):
    """Resolve the checkpoint weight/bias parameter names for a vLLM layer.

    The default convention is ``{layer_name}.weight`` / ``{layer_name}.bias``;
    layers whose checkpoint names differ from the vLLM layer name are remapped
    here first.  Since vLLM 0.5.3 the default convention holds almost
    everywhere, so only a few overrides remain.

    Args:
        model_type: model family key (see ``smooth_model_config``).
        layer_name: dotted vLLM layer name.

    Returns:
        Tuple of ``(layer_name, weight_name, bias_name)`` after remapping.
    """
    # Layers needing a name override are listed here.  hunyuan's lm_head
    # shares the input embedding table, so its parameters live under
    # model.embed_tokens in the checkpoint.
    if model_type == "hunyuan" and "lm_head" in layer_name:
        layer_name = "model.embed_tokens"

    return layer_name, f"{layer_name}.weight", f"{layer_name}.bias"
|
|
|
|
|
|
def modify_layer_weight_bias_name(model_type, named_parameters):
    """Rename parameters in-place where the vLLM name differs from the HF name.

    Args:
        model_type: model family key.
        named_parameters: mapping of parameter name -> tensor; mutated in
            place (entries are popped and re-inserted under the HF name).
    """
    # Per-model vLLM-name -> HF-name overrides.
    renames = {
        "chatglm": {
            "transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
        },
    }

    for vllm_name, hf_name in renames.get(model_type, {}).items():
        if vllm_name in named_parameters:
            named_parameters[hf_name] = named_parameters.pop(vllm_name)
|
|
|
|
|
|
def extract_numbers(string):
    """Return the last run of digits in ``string`` as an int, or 0 if none.

    e.g. ``"chatglm2"`` -> 2, ``"v4.5"`` -> 5, ``"chatglm"`` -> 0.
    """
    # Find every contiguous digit sequence; only the final one matters.
    digit_runs = re.findall(r'\d+', string)
    return int(digit_runs[-1]) if digit_runs else 0
|
|
|
|
|
|
def get_qkv_distribution(model_type, model_version, hf_config):
    """Report whether a model's packed QKV weight uses the n3sh layout.

    Layouts for a packed QKV projection:
        n3sh: [head_num, 3, head_size, hidden_size]
        3nsh: [3, head_num, head_size, hidden_size]

    vLLM stores packed QKV as 3nsh; some Hugging Face checkpoints use n3sh,
    so the tool converts 3nsh -> n3sh for the models flagged here.  Only
    relevant for models with a single packed QKV layer.

    Returns:
        ``(is_n3sh, head_num, kv_head_num)``; head counts are 0 when the
        model is not n3sh.
    """
    # chatglm v1 (no digits / trailing 0 in its version string), bloom and
    # gpt_neox: n3sh with no grouped-query attention (kv heads == heads).
    if model_type in ("bloom", "gpt_neox") or (
            model_type == "chatglm" and extract_numbers(model_version) == 0):
        heads = hf_config.num_attention_heads
        return True, heads, heads

    if model_type == "falcon":
        heads = hf_config.num_attention_heads
        # Falcon variants: new decoder arch carries an explicit kv head
        # count, older multi-query models share one kv head, otherwise MHA.
        if hf_config.new_decoder_architecture:
            kv_heads = hf_config.num_kv_heads
        elif hf_config.multi_query:
            kv_heads = 1
        else:
            kv_heads = heads
        return True, heads, kv_heads

    return False, 0, 0
|