[Bugfix] Support quant config in glm46v (#7062)

### What this PR does / why we need it?
This PR adds quant config support for glm46v (the `glm4v_moe` and `glm4v_moe_text` model types).

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
We tested w8a8 weights using the 'Ascend/msit' quantization method.
The model ran successfully on NPU with vllm-ascend using these w8a8 weights.

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

Signed-off-by: g00887675/loganJane <g00887675/loganJane73@hotmail.com>
Co-authored-by: g00887675/loganJane <g00887675/loganJane73@hotmail.com>
This commit is contained in:
LoganJane
2026-03-09 16:07:16 +08:00
committed by GitHub
parent 57c554a23f
commit eb648f7398

View File

@@ -77,6 +77,16 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
"qwen2_5_omni_text": {
"language_model.": "thinker.",
},
"glm4v_moe": {
"visual.": "model.visual.",
"language_model.lm_head.": "lm_head.",
"language_model.model.": "model.language_model.",
},
"glm4v_moe_text": {
"visual.": "model.visual.",
"language_model.lm_head.": "lm_head.",
"language_model.model.": "model.language_model.",
},
}
# key: model_type
@@ -186,6 +196,30 @@ packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
"fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
},
"glm4v_moe": {
"qkv_proj": [
"q_proj",
"k_proj",
"v_proj",
],
"gate_up_proj": [
"gate_proj",
"up_proj",
],
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
},
"glm4v_moe_text": {
"qkv_proj": [
"q_proj",
"k_proj",
"v_proj",
],
"gate_up_proj": [
"gate_proj",
"up_proj",
],
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
},
"longcat_flash": {
"gate_up_proj": ["gate_proj", "up_proj"],
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],