add qwen3
This commit is contained in:
206
vllm-v0.6.2/tools/quant_tools/model_special.py
Executable file
206
vllm-v0.6.2/tools/quant_tools/model_special.py
Executable file
@@ -0,0 +1,206 @@
|
||||
import re
|
||||
|
||||
# model_type, qkv_list, gate_up_list, is_gate_up
# Per-model smoothing configuration, keyed by HuggingFace `model_type`.
#   qkv_list      - attention projection layer-name fragments (split q/k/v
#                   lists, or a single fused layer such as "c_attn"/"W_pack")
#   gate_up_list  - MLP input projection layer-name fragments
#   is_gate_up    - whether the MLP input projections form a gate/up pair
#   moe_list      - expert-layer names for MoE models (None for dense models);
#                   "is_merged" marks experts stored as one fused tensor
#   skip_patterns - optional regexes of layer names excluded from smoothing
smooth_model_config = {
    "mllama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "llama": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen2_vl": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None,
        # NOTE(review): `\.*` matches zero or more literal dots, so this also
        # matches names like "visualX"; `^visual\.` may have been intended.
        # Kept as-is to preserve matching behavior.
        "skip_patterns": [r"^visual\.*"]
    },
    "qwen2": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    # Qwen3 reuses the Qwen2 projection layout (commit intent: "add qwen3").
    "qwen3": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "qwen": {
        "qkv_list": ["c_attn"],
        "gate_up_list": ["w2", "w1"],
        "is_gate_up": True,
        "moe_list": None
    },
    "baichuan": {
        "qkv_list": ["W_pack"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "chatglm": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "gpt_neox": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": [],
        "is_gate_up": True,
        "moe_list": None
    },
    "mixtral": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["w1", "w3"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["block_sparse_moe.w13", "w1", "w3"],
            "down_list": ["block_sparse_moe.w2", "w2"],
            "is_merged": True
        }
    },
    "qwen2_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    # Qwen3-MoE mirrors qwen2_moe's expert naming — TODO(review): confirm
    # against the actual vLLM Qwen3-MoE module names.
    "qwen3_moe": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "deepseek_v2": {
        "qkv_list": ["q_proj", "q_b_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        },
        "skip_patterns": [r".*\.kv_b_proj\..*",]
    },
    "falcon": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": True,
        "moe_list": None
    },
    "bloom": {
        "qkv_list": ["query_key_value"],
        "gate_up_list": ["dense_h_to_4h"],
        "is_gate_up": False,
        "moe_list": None
    },
    "internlm2": {
        "qkv_list": ["wqkv"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
    "hunyuan": {
        "qkv_list": ["q_proj", "k_proj", "v_proj"],
        "gate_up_list": ["gate_proj", "up_proj"],
        "is_gate_up": True,
        "moe_list": {
            "gate_up_list": ["mlp.w1", "gate_proj", "up_proj"],
            "down_list": ["mlp.w2", "down_proj"],
            "is_merged": True
        }
    },
    "phi3": {
        "qkv_list": ["qkv_proj"],
        "gate_up_list": ["gate_up_proj"],
        "is_gate_up": True,
        "moe_list": None
    },
}
|
||||
|
||||
|
||||
def get_layer_weight_bias_name(model_type, layer_name):
    '''
    Resolve the checkpoint parameter names for a layer.

    Returns a (layer_name, weight_name, bias_name) tuple. By default the
    parameters are simply "{layer_name}.weight" / "{layer_name}.bias"; the
    table of special cases below covers models whose vLLM layer name differs
    from the HF checkpoint name (since vLLM 0.5.3 the default rule holds for
    almost every layer, so this list stays short).
    '''
    # Special cases go here. hunyuan ties lm_head to the input embedding, so
    # its parameters live under model.embed_tokens.
    if model_type == "hunyuan" and "lm_head" in layer_name:
        remapped = "model.embed_tokens"
        return remapped, "model.embed_tokens.weight", "model.embed_tokens.bias"

    return layer_name, f"{layer_name}.weight", f"{layer_name}.bias"
|
||||
|
||||
|
||||
def modify_layer_weight_bias_name(model_type, named_parameters):
    '''
    Rename checkpoint keys, in place, for models whose vLLM parameter name
    is not the same as the HF name. Mutates `named_parameters`; returns None.
    Models not listed in the rename table are left untouched.
    '''
    # old-key -> new-key rename table, per model type
    renames = {
        "chatglm": {
            "transformer.embedding.weight": "transformer.embedding.word_embeddings.weight"
        },
    }

    for src_key, dst_key in renames.get(model_type, {}).items():
        if src_key in named_parameters:
            named_parameters[dst_key] = named_parameters.pop(src_key)
|
||||
|
||||
|
||||
def extract_numbers(string):
    '''
    Return the last integer embedded in `string`, or 0 when it contains none.
    e.g. "chatglm2-6b" -> 6, "v12" -> 12, "base" -> 0.
    '''
    # Collect every run of digits in the string, then keep only the final run.
    digit_runs = re.findall(r'\d+', string)
    return int(digit_runs[-1]) if digit_runs else 0
|
||||
|
||||
|
||||
def get_qkv_distribution(model_type, model_version, hf_config):
    '''
    Get the fused-QKV weight distribution of a model: n3sh or 3nsh.
        n3sh: [head_num, 3, head_size, hidden_size]
        3nsh: [3, head_num, head_size, hidden_size]
    vLLM's default QKV distribution is 3nsh, so this reports the models whose
    packed QKV layer is stored n3sh; the tools then convert 3nsh to n3sh to
    match the Hugging Face distribution. Only meaningful for packed QKV layers.

    Returns (is_n3sh, head_num, kv_head_num); (False, 0, 0) for every model
    whose layout already matches the default.
    '''
    # falcon is n3sh, with three possible KV-head configurations.
    if model_type == "falcon":
        heads = hf_config.num_attention_heads
        if hf_config.new_decoder_architecture:
            kv_heads = hf_config.num_kv_heads
        elif hf_config.multi_query:
            kv_heads = 1
        else:
            kv_heads = heads
        return True, heads, kv_heads

    # first-generation chatglm (version number 0), bloom and gpt_neox are
    # n3sh with as many KV heads as attention heads.
    legacy_chatglm = model_type == "chatglm" and extract_numbers(model_version) == 0
    if legacy_chatglm or model_type in ("bloom", "gpt_neox"):
        heads = hf_config.num_attention_heads
        return True, heads, heads

    return False, 0, 0
|
||||
Reference in New Issue
Block a user