Fix loading KV quantization scale; Enable modelopt kv cache (#4686)
Co-authored-by: qingquansong <ustcsqq@gmail.com>
@@ -489,6 +489,7 @@ class DeepseekV2Attention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_local_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
@@ -669,6 +670,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             num_kv_heads=1,
             layer_id=layer_id,
             v_head_dim=self.kv_lora_rank,
+            quant_config=quant_config,
             prefix=add_prefix("attn_mqa", prefix),
         )
 
@@ -679,6 +681,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             num_kv_heads=self.num_local_heads,
             layer_id=layer_id,
             v_head_dim=self.v_head_dim,
+            quant_config=quant_config,
             prefix=add_prefix("attn_mha", prefix),
         )
 
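The one-line change repeated across the three hunks threads quant_config into each RadixAttention call site, so the attention layers can see the quantization settings and the per-layer KV-cache scales stored in a ModelOpt-quantized checkpoint have somewhere to load into. Below is a minimal, self-contained sketch of that pattern; QuantConfig, AttentionWithKVScale, and scale_kv_for_cache are hypothetical names for illustration, not sglang's actual API.

# A minimal sketch (not sglang's implementation): an attention layer that only
# creates KV-cache scale parameters when it is handed a quant_config, so that
# checkpoint entries such as "...attn.k_scale" have a parameter to load into.
from typing import Optional

import torch
from torch import nn


class QuantConfig:
    # Hypothetical stand-in for a quantization config object.
    def __init__(self, kv_cache_dtype: str = "auto"):
        self.kv_cache_dtype = kv_cache_dtype


class AttentionWithKVScale(nn.Module):
    def __init__(self, head_dim: int, quant_config: Optional[QuantConfig] = None):
        super().__init__()
        self.head_dim = head_dim
        if quant_config is not None and quant_config.kv_cache_dtype == "fp8":
            # Defaults of 1.0 are overwritten when the checkpoint (e.g. one
            # produced by ModelOpt) provides per-layer k_scale / v_scale.
            self.k_scale = nn.Parameter(torch.tensor(1.0), requires_grad=False)
            self.v_scale = nn.Parameter(torch.tensor(1.0), requires_grad=False)
        else:
            self.k_scale = None
            self.v_scale = None

    def scale_kv_for_cache(self, k: torch.Tensor, v: torch.Tensor):
        # A real FP8 KV-cache path would also cast to an FP8 dtype after
        # dividing by the loaded scales; only the scaling is shown here.
        if self.k_scale is not None:
            k = k / self.k_scale
            v = v / self.v_scale
        return k, v


# Without a quant_config the layer exposes no scale parameters; with one,
# k_scale / v_scale exist as named parameters and can receive checkpoint values.
layer = AttentionWithKVScale(head_dim=128, quant_config=QuantConfig("fp8"))
print(sorted(name for name, _ in layer.named_parameters()))  # ['k_scale', 'v_scale']

Under this reading of the diff, the fix is purely constructor plumbing: if quant_config never reaches the attention layer, the ModelOpt scales have no matching parameter at load time, which is why all three RadixAttention call sites gain the same argument.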