[Fix] fix deepseek v0 attention eager mode (#671)

### What this PR does / why we need it?
`reshape_and_cache_siso` seems to have some functionality issues, so this change replaces the custom op with a combination of plain torch ops by default.
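
For context, here is a minimal sketch of what a torch-op based cache write of this kind could look like. It assumes the MLA cache can be viewed as one flat row of `kv_lora_rank + rope_dim` elements per slot and that padded slots carry a negative index; the helper name `concat_and_cache_mla_sketch` and the exact cache layout are illustrative assumptions, not the actual `vllm_ascend.ops.cache.concat_and_cache_mla` implementation:

```python
import torch


def concat_and_cache_mla_sketch(kv_c_normed: torch.Tensor,
                                k_pe: torch.Tensor,
                                kv_cache: torch.Tensor,
                                slot_mapping: torch.Tensor) -> None:
    """Hypothetical torch-op combination for an MLA cache write.

    Assumes kv_cache can be viewed as [num_slots, kv_lora_rank + rope_dim]
    and that slot_mapping holds one destination slot per token (negative
    entries mark padding and are skipped).
    """
    num_tokens = slot_mapping.numel()

    # Flatten the latent KV and the rotary key per token, then concatenate
    # them into a single cache row per token.
    kv_c = kv_c_normed.view(num_tokens, -1)
    pe = k_pe.view(num_tokens, -1)
    rows = torch.cat([kv_c, pe], dim=-1)

    # Scatter the rows into the flat cache at their assigned slots,
    # skipping padded positions.
    cache_flat = kv_cache.view(-1, rows.shape[-1])
    valid = slot_mapping >= 0
    cache_flat.index_copy_(0,
                           slot_mapping[valid].to(torch.long),
                           rows[valid].to(cache_flat.dtype))
```

The committed change itself calls the repo's `concat_and_cache_mla` helper (imported in the first hunk of the diff below) and leaves a fused kernel as a TODO.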


---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Commit 38f34e359f (parent 413657ae43)
Authored by Pleaplusone on 2025-04-28 08:53:06 +08:00; committed by GitHub


@@ -40,6 +40,7 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
 from vllm.config import get_current_vllm_config
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
+from vllm_ascend.ops.cache import concat_and_cache_mla
 from vllm_ascend.worker.model_runner import (
     ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
@@ -1086,16 +1087,10 @@ class AscendMLAAttentionBackendImpl(MLAAttentionImpl):
                 key_cache=kv_cache[0],
                 value_cache=kv_cache[1],
                 slot_indices=slots)
-        else:
-            if kv_cache.numel() > 0:
-                key = torch.cat([
-                    kv_c_normed.view(num_tokens, self.num_kv_heads, -1), k_pe
-                ],
-                                dim=2)
-                slots = attn_metadata.slot_mapping
-                torch_npu._npu_reshape_and_cache_siso(key=key,
-                                                      key_cache=kv_cache,
-                                                      slot_indices=slots)
+        elif kv_cache.numel() > 0:
+            # TODO replace this naive implement with fusion kernel
+            concat_and_cache_mla(kv_c_normed, k_pe, kv_cache,
+                                 attn_metadata.slot_mapping)
         if attn_metadata.num_prefills > 0:
             attn_output = torch.empty(num_tokens,