forked from EngineX-Cambricon/enginex-mlu370-vllm
opt llama3
@@ -74,18 +74,22 @@ def vllm__module_executor__models__llama__LlamaAttention__forward(
     smooth_quant_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
-    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
     '''
     =============================
     Modify by vllm_mlu
     =============================
     @brief: pack q & k to fit tmo.apply_rotary
     @optimization: avoid redundant split operation
     '''
     if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
     else:
-        qk, _ = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
+        # Optimized: split qkv into [qk, v] directly, avoiding a redundant split
+        qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
         self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
+        # Split qk into q and k after rotary embedding
         q, k = qk.split([self.q_size, self.kv_size], dim=-1)
     '''
     ==================
     End of MLU Hijack
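For readers without the MLU stack, here is a minimal sketch of the split change in plain PyTorch. The sizes, the toy tensor, and the omission of a real rotary kernel are assumptions made for illustration; the actual code uses self.q_size/self.kv_size and an in-place MLU rotary op (tmo.apply_rotary). The point is that a single [qk, v] split replaces the earlier three-way split followed by a re-split, with q and k recovered by one split after the rotary op.

# Minimal sketch of the packed-split idea (assumed toy sizes, no real rotary kernel).
import torch

num_heads, num_kv_heads, head_dim = 8, 2, 16
q_size = num_heads * head_dim        # 128
kv_size = num_kv_heads * head_dim    # 32
qkv = torch.randn(4, q_size + 2 * kv_size)   # [num_tokens, 192]

# Baseline: three-way split; q and k then go to the rotary op separately.
q_ref, k_ref, v_ref = qkv.split([q_size, kv_size, kv_size], dim=-1)

# Optimized path: split once into [qk, v]; q and k stay packed so one fused
# rotary call can rotate both (the real code views qk as
# [-1, num_heads + num_kv_heads, head_dim] and applies the op in place).
qk, v = qkv.split([q_size + kv_size, kv_size], dim=-1)
qk_heads = qk.view(-1, num_heads + num_kv_heads, head_dim)

# ... the in-place rotary op would run on qk_heads here ...

# Recover q and k with a single split afterwards.
q, k = qk.split([q_size, kv_size], dim=-1)

# With no rotation actually applied, both paths yield identical tensors.
assert torch.equal(q, q_ref) and torch.equal(k, k_ref) and torch.equal(v, v_ref)

The packed layout means one split call instead of two and one rotary launch covering both q and k heads, which is where the "avoid redundant split operation" saving in the diff comes from.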