[Bugfix] Support quant config in glm46v (#7062)
### What this PR does / why we need it?
We need to support quant config in glm46v
.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
We used the 'Ascend/msit' quantization method to test the w8a8 weights.
Successfully ran on NPU using vllm-ascend by the w8a8 weights.
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
Signed-off-by: g00887675/loganJane <g00887675/loganJane73@hotmail.com>
Co-authored-by: g00887675/loganJane <g00887675/loganJane73@hotmail.com>
This commit is contained in:
@@ -77,6 +77,16 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
|
||||
"qwen2_5_omni_text": {
|
||||
"language_model.": "thinker.",
|
||||
},
|
||||
"glm4v_moe": {
|
||||
"visual.": "model.visual.",
|
||||
"language_model.lm_head.": "lm_head.",
|
||||
"language_model.model.": "model.language_model.",
|
||||
},
|
||||
"glm4v_moe_text": {
|
||||
"visual.": "model.visual.",
|
||||
"language_model.lm_head.": "lm_head.",
|
||||
"language_model.model.": "model.language_model.",
|
||||
},
|
||||
}
|
||||
|
||||
# key: model_type
|
||||
@@ -186,6 +196,30 @@ packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
|
||||
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
|
||||
"fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
|
||||
},
|
||||
"glm4v_moe": {
|
||||
"qkv_proj": [
|
||||
"q_proj",
|
||||
"k_proj",
|
||||
"v_proj",
|
||||
],
|
||||
"gate_up_proj": [
|
||||
"gate_proj",
|
||||
"up_proj",
|
||||
],
|
||||
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
|
||||
},
|
||||
"glm4v_moe_text": {
|
||||
"qkv_proj": [
|
||||
"q_proj",
|
||||
"k_proj",
|
||||
"v_proj",
|
||||
],
|
||||
"gate_up_proj": [
|
||||
"gate_proj",
|
||||
"up_proj",
|
||||
],
|
||||
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
|
||||
},
|
||||
"longcat_flash": {
|
||||
"gate_up_proj": ["gate_proj", "up_proj"],
|
||||
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
|
||||
|
||||
Reference in New Issue
Block a user