From 38f34e359f08bcf652d2a95b2a2521880286d710 Mon Sep 17 00:00:00 2001
From: Pleaplusone <38376071+ganyi1996ppo@users.noreply.github.com>
Date: Mon, 28 Apr 2025 08:53:06 +0800
Subject: [PATCH] [Fix] fix deepseek v0 attention eager mode (#671)

### What this PR does / why we need it?
`reshape_and_cache_siso` seems to have some functionality issues, so use a
torch op combination to replace this custom op by default.

---------

Signed-off-by: ganyi
---
 vllm_ascend/attention/attention.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py
index 8e5a1ba..3de391f 100644
--- a/vllm_ascend/attention/attention.py
+++ b/vllm_ascend/attention/attention.py
@@ -40,6 +40,7 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
 from vllm.config import get_current_vllm_config
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
+from vllm_ascend.ops.cache import concat_and_cache_mla
 from vllm_ascend.worker.model_runner import (
     ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
 
@@ -1086,16 +1087,10 @@ class AscendMLAAttentionBackendImpl(MLAAttentionImpl):
                 key_cache=kv_cache[0],
                 value_cache=kv_cache[1],
                 slot_indices=slots)
-        else:
-            if kv_cache.numel() > 0:
-                key = torch.cat([
-                    kv_c_normed.view(num_tokens, self.num_kv_heads, -1), k_pe
-                ],
-                                dim=2)
-                slots = attn_metadata.slot_mapping
-                torch_npu._npu_reshape_and_cache_siso(key=key,
-                                                      key_cache=kv_cache,
-                                                      slot_indices=slots)
+        elif kv_cache.numel() > 0:
+            # TODO replace this naive implement with fusion kernel
+            concat_and_cache_mla(kv_c_normed, k_pe, kv_cache,
+                                 attn_metadata.slot_mapping)
 
         if attn_metadata.num_prefills > 0:
             attn_output = torch.empty(num_tokens,