diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py
index 3678c6f3..3e3b308a 100644
--- a/vllm_ascend/quantization/modelslim_config.py
+++ b/vllm_ascend/quantization/modelslim_config.py
@@ -77,6 +77,16 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
     "qwen2_5_omni_text": {
         "language_model.": "thinker.",
     },
+    "glm4v_moe": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
+    "glm4v_moe_text": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
 }
 
 # key: model_type
@@ -186,6 +196,30 @@ packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
         "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
         "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
+    "glm4v_moe": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
+    "glm4v_moe_text": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
     "longcat_flash": {
         "gate_up_proj": ["gate_proj", "up_proj"],
         "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],