From 38f34e359f08bcf652d2a95b2a2521880286d710 Mon Sep 17 00:00:00 2001
From: Pleaplusone <38376071+ganyi1996ppo@users.noreply.github.com>
Date: Mon, 28 Apr 2025 08:53:06 +0800
Subject: [PATCH] [Fix] fix deepseek v0 attention eager mode (#671)

### What this PR does / why we need it?
`reshape_and_cache_siso` seems to have some functionality issues, so use a
torch op combination to replace this custom op by default.

---------

Signed-off-by: ganyi
---
 vllm_ascend/attention/attention.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py
index 8e5a1ba..3de391f 100644
--- a/vllm_ascend/attention/attention.py
+++ b/vllm_ascend/attention/attention.py
@@ -40,6 +40,7 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
 from vllm.config import get_current_vllm_config
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
+from vllm_ascend.ops.cache import concat_and_cache_mla
 from vllm_ascend.worker.model_runner import (
     ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
 
@@ -1086,16 +1087,10 @@ class AscendMLAAttentionBackendImpl(MLAAttentionImpl):
                 key_cache=kv_cache[0],
                 value_cache=kv_cache[1],
                 slot_indices=slots)
-        else:
-            if kv_cache.numel() > 0:
-                key = torch.cat([
-                    kv_c_normed.view(num_tokens, self.num_kv_heads, -1), k_pe
-                ],
-                                dim=2)
-                slots = attn_metadata.slot_mapping
-                torch_npu._npu_reshape_and_cache_siso(key=key,
-                                                      key_cache=kv_cache,
-                                                      slot_indices=slots)
+        elif kv_cache.numel() > 0:
+            # TODO replace this naive implement with fusion kernel
+            concat_and_cache_mla(kv_c_normed, k_pe, kv_cache,
+                                 attn_metadata.slot_mapping)
 
         if attn_metadata.num_prefills > 0:
             attn_output = torch.empty(num_tokens,