add qwen3
This commit is contained in:
206
vllm-v0.6.2/tools/quant_tools/model_special.py
Executable file
206
vllm-v0.6.2/tools/quant_tools/model_special.py
Executable file
@@ -0,0 +1,206 @@
|
||||
import re
|
||||
|
||||
# model_type, qkv_list, gate_up_list, is_gate_up
# Per-model smoothing configuration, keyed by HuggingFace `model_type`.
#   qkv_list      - attention projection layer-name fragments (split q/k/v
#                   lists, or a single fused layer such as "c_attn"/"W_pack")
#   gate_up_list  - MLP input projection layer-name fragments
#   is_gate_up    - whether the MLP input projections form a gate/up pair
#   moe_list      - expert-layer names for MoE models (None for dense models);
#                   "is_merged" marks experts stored as one fused tensor
#   skip_patterns - optional regexes of layer names excluded from smoothing
smooth_model_config = {
    "mllama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "llama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen2_vl": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None,
        # NOTE(review): `\.*` matches zero or more literal dots, so this also
        # matches names like "visualX"; `^visual\.` may have been intended.
        # Kept as-is to preserve matching behavior.
        "skip_patterns": [r"^visual\.*"]
    },
    "qwen2": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    # Qwen3 reuses the Qwen2 projection layout (commit intent: "add qwen3").
    "qwen3": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen": {
        "qkv_list": ["c_attn"],
        "gate_up_list": ["w2", "w1"],
        "is_gate_up": True,
        "moe_list": None
    },
    "baichuan": {
        "qkv_list": ["W_pack"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "chatglm": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "gpt_neox": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": [],
        "is_gate_up": True,
        "moe_list": None
    },
    "mixtral": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["w1", "w3"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
            "down_list": ["block_sparse_moe.w2", "w2"],
            "is_merged": True
        }
    },
    "qwen2_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    # Qwen3-MoE mirrors qwen2_moe's expert naming — TODO(review): confirm
    # against the actual vLLM Qwen3-MoE module names.
    "qwen3_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "deepseek_v2": {
        "qkv_list": ["q_proj", "q_b_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        },
        "skip_patterns": [r".*\.kv_b_proj\..*",]
    },
    "falcon": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "bloom": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": False,
        "moe_list": None
    },
    "internlm2": {
        "qkv_list": ["wqkv"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "hunyuan": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "phi3": {
        "qkv_list": ["qkv_proj"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
}
|
||||
|
||||
|
||||
def get_layer_weight_bias_name(model_type, layer_name):
    '''
    Resolve the checkpoint parameter names for a layer.

    Returns a (layer_name, weight_name, bias_name) tuple. By default the
    parameters are simply "{layer_name}.weight" / "{layer_name}.bias"; the
    table of special cases below covers models whose vLLM layer name differs
    from the HF checkpoint name (since vLLM 0.5.3 the default rule holds for
    almost every layer, so this list stays short).
    '''
    # Special cases go here. hunyuan ties lm_head to the input embedding, so
    # its parameters live under model.embed_tokens.
    if model_type == "hunyuan" and "lm_head" in layer_name:
        remapped = "model.embed_tokens"
        return remapped, "model.embed_tokens.weight", "model.embed_tokens.bias"

    return layer_name, f"{layer_name}.weight", f"{layer_name}.bias"
|
||||
|
||||
|
||||
def modify_layer_weight_bias_name(model_type, named_parameters):
    '''
    Rename checkpoint keys, in place, for models whose vLLM parameter name
    is not the same as the HF name. Mutates `named_parameters`; returns None.
    Models not listed in the rename table are left untouched.
    '''
    # old-key -> new-key rename table, per model type
    renames = {
        "chatglm": {
            "transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
        },
    }

    for src_key, dst_key in renames.get(model_type, {}).items():
        if src_key in named_parameters:
            named_parameters[dst_key] = named_parameters.pop(src_key)
|
||||
|
||||
|
||||
def extract_numbers(string):
    '''
    Return the last integer embedded in `string`, or 0 when it contains none.
    e.g. "chatglm2-6b" -> 6, "v12" -> 12, "base" -> 0.
    '''
    # Collect every run of digits in the string, then keep only the final run.
    digit_runs = re.findall(r'\d+', string)
    return int(digit_runs[-1]) if digit_runs else 0
|
||||
|
||||
|
||||
def get_qkv_distribution(model_type, model_version, hf_config):
    '''
    Get the fused-QKV weight distribution of a model: n3sh or 3nsh.
        n3sh: [head_num, 3, head_size, hidden_size]
        3nsh: [3, head_num, head_size, hidden_size]
    vLLM's default QKV distribution is 3nsh, so this reports the models whose
    packed QKV layer is stored n3sh; the tools then convert 3nsh to n3sh to
    match the Hugging Face distribution. Only meaningful for packed QKV layers.

    Returns (is_n3sh, head_num, kv_head_num); (False, 0, 0) for every model
    whose layout already matches the default.
    '''
    # falcon is n3sh, with three possible KV-head configurations.
    if model_type == "falcon":
        heads = hf_config.num_attention_heads
        if hf_config.new_decoder_architecture:
            kv_heads = hf_config.num_kv_heads
        elif hf_config.multi_query:
            kv_heads = 1
        else:
            kv_heads = heads
        return True, heads, kv_heads

    # first-generation chatglm (version number 0), bloom and gpt_neox are
    # n3sh with as many KV heads as attention heads.
    legacy_chatglm = model_type == "chatglm" and extract_numbers(model_version) == 0
    if legacy_chatglm or model_type in ("bloom", "gpt_neox"):
        heads = hf_config.num_attention_heads
        return True, heads, heads

    return False, 0, 0
|
||||
Reference in New Issue
Block a user