opt llama3: avoid redundant qkv split in LlamaAttention forward
This commit is contained in:
@@ -74,18 +74,22 @@ def vllm__module_executor__models__llama__LlamaAttention__forward(
|
|||||||
smooth_quant_scale: Optional[torch.Tensor] = None,
|
smooth_quant_scale: Optional[torch.Tensor] = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
|
qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
|
||||||
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
|
|
||||||
'''
|
'''
|
||||||
=============================
|
=============================
|
||||||
Modify by vllm_mlu
|
Modify by vllm_mlu
|
||||||
=============================
|
=============================
|
||||||
@brief: pack q & k to fit tmo.apply_rotary
|
@brief: pack q & k to fit tmo.apply_rotary
|
||||||
|
@optimization: avoid redundant split operation
|
||||||
'''
|
'''
|
||||||
if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
|
if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
|
||||||
|
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
|
||||||
q, k = self.rotary_emb(positions, q, k)
|
q, k = self.rotary_emb(positions, q, k)
|
||||||
else:
|
else:
|
||||||
qk, _ = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
|
# Optimized: split qkv into [qk, v] directly, avoiding redundant split
|
||||||
|
qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
|
||||||
self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
|
self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
|
||||||
|
# Split qk into q and k after rotary embedding
|
||||||
|
q, k = qk.split([self.q_size, self.kv_size], dim=-1)
|
||||||
'''
|
'''
|
||||||
==================
|
==================
|
||||||
End of MLU Hijack
|
End of MLU Hijack
|
||||||
|
|||||||
Reference in New Issue
Block a user