Fix loading KV quantization scale; Enable modelopt kv cache (#4686)

Co-authored-by: qingquansong <ustcsqq@gmail.com>
This commit is contained in:
Yun Dai
2025-04-08 09:11:35 -07:00
committed by GitHub
parent 88d6fd9a11
commit 2695ab0537
38 changed files with 151 additions and 76 deletions

View File

@@ -192,6 +192,7 @@ class MiniCPM3Attention(nn.Module):
self.scaling,
num_kv_heads=self.num_local_heads,
layer_id=layer_id,
quant_config=quant_config,
prefix=add_prefix("attn", prefix),
)
@@ -343,6 +344,7 @@ class MiniCPM3AttentionMLA(nn.Module):
num_kv_heads=1,
layer_id=layer_id,
v_head_dim=self.kv_lora_rank,
quant_config=quant_config,
prefix=add_prefix("attn", prefix),
)