diff --git a/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama.py b/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama.py
index 19f0007..3d3c59d 100644
--- a/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama.py
+++ b/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama.py
@@ -74,18 +74,22 @@ def vllm__module_executor__models__llama__LlamaAttention__forward(
     smooth_quant_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
-    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
     '''
     =============================
     Modify by vllm_mlu
     =============================
     @brief: pack q & k to fit tmo.apply_rotary
+    @optimization: avoid redundant split operation
     '''
     if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
     else:
-        qk, _ = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
+        # Optimized: split qkv into [qk, v] directly, avoiding redundant split
+        qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
         self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
+        # Split qk into q and k after rotary embedding
+        q, k = qk.split([self.q_size, self.kv_size], dim=-1)
     '''
     ==================
     End of MLU Hijack
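
For context, the sketch below reproduces the packed-split pattern from this diff on plain PyTorch tensors. The head counts and token count are illustrative assumptions, and apply_rotary_inplace is a hypothetical stand-in for the MLU tmo.apply_rotary kernel, whose real signature is not shown in this patch; the point is only the tensor layout, not the rotary math.

import torch

# Illustrative sizes (assumptions, not taken from any real model config).
num_heads, num_kv_heads, head_dim = 8, 2, 64
q_size = num_heads * head_dim      # 512
kv_size = num_kv_heads * head_dim  # 128
num_tokens = 4

qkv = torch.randn(num_tokens, q_size + 2 * kv_size)

def apply_rotary_inplace(x: torch.Tensor) -> None:
    # Hypothetical stand-in for tmo.apply_rotary: rotates channel pairs
    # in place, so the writes land in the parent qkv storage.
    x1 = x[..., 0::2].clone()
    x2 = x[..., 1::2].clone()
    x[..., 0::2] = -x2
    x[..., 1::2] = x1

# One split into [qk, v] instead of a 3-way split followed by re-packing
# q and k for the fused rotary kernel.
qk, v = qkv.split([q_size + kv_size, kv_size], dim=-1)

# Only the last (contiguous) dim is reshaped, so .view() is legal even
# though qk is a non-contiguous column slice of qkv.
apply_rotary_inplace(qk.view(-1, num_heads + num_kv_heads, head_dim))

# Recover q and k as views after the in-place rotation; no copies occur.
q, k = qk.split([q_size, kv_size], dim=-1)
assert q.shape == (num_tokens, q_size) and k.shape == (num_tokens, kv_size)

The design choice this illustrates: the pre-patch 3-way split handed q and k to the rotary op separately, whereas splitting into [qk, v] lets a single kernel invocation cover both projections as one contiguous batch of heads, and the final q/k split is free because torch.split along the last dim returns views rather than copies.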