[Bugfix] Support for mlapo in deepseekv3.1 w4a8 (#4828)
### What this PR does / why we need it?
Support mlapo in DeepSeek-V3.1 w4a8. Since the csrc implementation of mlapo requires the input arguments `enable_inner_out` and `inner_out`, we add these dummy arguments here.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: GDzhu01 <809721801@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -1064,7 +1064,8 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
|
||||
device = self.q_proj.weight.device
|
||||
self.gamma1 = self.q_a_layernorm.weight.data
|
||||
self.beta1 = self.q_a_layernorm.bias.data
|
||||
self.beta1 = torch.zeros_like(self.gamma1) if (
|
||||
_bias := self.q_a_layernorm.bias) is None else _bias.data
|
||||
self.gamma2 = self.kv_a_layernorm.weight.data
|
||||
self.quant_scale0 = self.fused_qkv_a_proj.input_scale.data
|
||||
self.quant_offset0 = self.fused_qkv_a_proj.input_offset.data
|
||||
@@ -1460,7 +1461,8 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
kv_cache_out0=decode_k_nope,
|
||||
q_out1=decode_q_pe,
|
||||
kv_cache_out1=decode_k_pe,
|
||||
)
|
||||
enable_inner_out=False,
|
||||
inner_out=torch.tensor([], device=hidden_states.device))
|
||||
decode_q_nope = decode_q_nope.view(bsz, self.num_heads,
|
||||
self.kv_lora_rank)
|
||||
decode_q_pe = decode_q_pe.view(bsz, self.num_heads, -1)
|
||||
|
||||
Reference in New Issue
Block a user