[Quant] Support W8A8 for GLM4.7-Flash (#6492)

### What this PR does / why we need it?
Support W8A8 quantization for the GLM4.7-Flash model by registering a `glm4_moe_lite` entry in `packed_modules_model_mapping`, so fused layers can be resolved back to their original checkpoint submodules during quantized weight loading.
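
For background, W8A8 quantizes both weights and activations to 8-bit integers. Below is a minimal sketch of symmetric per-channel int8 weight quantization; it is illustrative only and does not reflect this PR's actual kernel path (the function name is hypothetical):

```python
import torch

def quantize_per_channel_int8(w: torch.Tensor):
    """Symmetric per-output-channel int8 quantization (illustrative sketch).

    Returns an int8 tensor and a per-row fp32 scale such that
    w ≈ q.float() * scale.
    """
    # One scale per output channel: max|w| in the row maps to 127.
    scale = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0
    q = torch.round(w / scale).clamp(-127, 127).to(torch.int8)
    return q, scale

# Example: quantize a weight matrix and check the reconstruction error.
w = torch.randn(128, 256)
q, scale = quantize_per_channel_int8(w)
err = (w - q.float() * scale).abs().max()
```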
### Does this PR introduce _any_ user-facing change?
Yes
### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

Signed-off-by: dsxsteven <dsxsteven@sina.com>
Co-authored-by: SlightwindSec <slightwindsec@gmail.com>

@@ -166,6 +166,12 @@ packed_modules_model_mapping: Dict[str, Dict[str, List[str]]] = {
"experts":
["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
},
"glm4_moe_lite": {
"gate_up_proj": ["gate_proj", "up_proj"],
"experts":
["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
"fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
},
"longcat_flash": {
"gate_up_proj": ["gate_proj", "up_proj"],
"experts":