[Quant] GLM4.7-Flash Support W8A8 (#6492)
### What this PR does / why we need it?

Support W8A8 quantization for the GLM4.7-Flash model.

### Does this PR introduce _any_ user-facing change?

Yes

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

Signed-off-by: dsxsteven <dsxsteven@sina.com>
Co-authored-by: SlightwindSec <slightwindsec@gmail.com>
This commit is contained in:
@@ -166,6 +166,12 @@ packed_modules_model_mapping: Dict[str, Dict[str, List[str]]] = {
|
|||||||
"experts":
|
"experts":
|
||||||
["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
|
["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
|
||||||
},
|
},
|
||||||
|
"glm4_moe_lite": {
|
||||||
|
"gate_up_proj": ["gate_proj", "up_proj"],
|
||||||
|
"experts":
|
||||||
|
["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
|
||||||
|
"fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
|
||||||
|
},
|
||||||
"longcat_flash": {
|
"longcat_flash": {
|
||||||
"gate_up_proj": ["gate_proj", "up_proj"],
|
"gate_up_proj": ["gate_proj", "up_proj"],
|
||||||
"experts":
|
"experts":
|
||||||
|
|||||||
Reference in New Issue
Block a user