From eb648f739863c301aeaf72725f9d14f7ed8be518 Mon Sep 17 00:00:00 2001
From: LoganJane <42287016+LoganJane@users.noreply.github.com>
Date: Mon, 9 Mar 2026 16:07:16 +0800
Subject: [PATCH] [Bugfix] Support quant config in glm46v (#7062)

### What this PR does / why we need it?
We need to support quant config in glm46v.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
We used the 'Ascend/msit' quantization method to test the w8a8 weights.
Successfully ran on NPU using vllm-ascend with the w8a8 weights.

- vLLM version: v0.16.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

Signed-off-by: g00887675/loganJane
Co-authored-by: g00887675/loganJane
---
 vllm_ascend/quantization/modelslim_config.py | 34 ++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py
index 3678c6f3..3e3b308a 100644
--- a/vllm_ascend/quantization/modelslim_config.py
+++ b/vllm_ascend/quantization/modelslim_config.py
@@ -77,6 +77,16 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
     "qwen2_5_omni_text": {
         "language_model.": "thinker.",
     },
+    "glm4v_moe": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
+    "glm4v_moe_text": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
 }
 
 # key: model_type
@@ -186,6 +196,30 @@ packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
         "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
         "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
+    "glm4v_moe": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
+    "glm4v_moe_text": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
     "longcat_flash": {
         "gate_up_proj": ["gate_proj", "up_proj"],
         "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],