forked from EngineX-Cambricon/enginex-mlu370-vllm
add deepseekv3 and llama4
This commit is contained in:
@@ -176,14 +176,14 @@ def vllm__llama4__Llama4Attention__forward(
|
|||||||
==================
|
==================
|
||||||
'''
|
'''
|
||||||
|
|
||||||
# QK norm (lesson learned #2: use contiguous + reshape)
|
# QK norm (MLU fused_rms_norm requires matching dtypes, skip .float())
|
||||||
if self.qk_norm is not None:
|
if self.qk_norm is not None:
|
||||||
q = q.contiguous().reshape(-1, self.head_dim)
|
q = q.contiguous().reshape(-1, self.head_dim)
|
||||||
q = (self.qk_norm(q.float())
|
q = (self.qk_norm(q)
|
||||||
.contiguous().reshape(-1, self.q_size).to(q.dtype))
|
.contiguous().reshape(-1, self.q_size))
|
||||||
k = k.contiguous().reshape(-1, self.head_dim)
|
k = k.contiguous().reshape(-1, self.head_dim)
|
||||||
k = (self.qk_norm(k.float())
|
k = (self.qk_norm(k)
|
||||||
.contiguous().reshape(-1, self.kv_size).to(k.dtype))
|
.contiguous().reshape(-1, self.kv_size))
|
||||||
|
|
||||||
# Temperature tuning for NoPE layers
|
# Temperature tuning for NoPE layers
|
||||||
if self.attn_temperature_tuning and self.nope:
|
if self.attn_temperature_tuning and self.nope:
|
||||||
|
|||||||
Reference in New Issue
Block a user