[Fix] fix deepseek v0 attention eager mode (#671)
### What this PR does / why we need it? `reshape_and_cache_siso` seems to have some functionality issues, so use a combination of torch ops to replace this custom op by default. --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
This commit is contained in:
@@ -40,6 +40,7 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
|
|||||||
from vllm.config import get_current_vllm_config
|
from vllm.config import get_current_vllm_config
|
||||||
from vllm.utils import async_tensor_h2d, make_tensor_with_pad
|
from vllm.utils import async_tensor_h2d, make_tensor_with_pad
|
||||||
|
|
||||||
|
from vllm_ascend.ops.cache import concat_and_cache_mla
|
||||||
from vllm_ascend.worker.model_runner import (
|
from vllm_ascend.worker.model_runner import (
|
||||||
ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
|
ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
|
||||||
|
|
||||||
@@ -1086,16 +1087,10 @@ class AscendMLAAttentionBackendImpl(MLAAttentionImpl):
|
|||||||
key_cache=kv_cache[0],
|
key_cache=kv_cache[0],
|
||||||
value_cache=kv_cache[1],
|
value_cache=kv_cache[1],
|
||||||
slot_indices=slots)
|
slot_indices=slots)
|
||||||
else:
|
elif kv_cache.numel() > 0:
|
||||||
if kv_cache.numel() > 0:
|
# TODO replace this naive implement with fusion kernel
|
||||||
key = torch.cat([
|
concat_and_cache_mla(kv_c_normed, k_pe, kv_cache,
|
||||||
kv_c_normed.view(num_tokens, self.num_kv_heads, -1), k_pe
|
attn_metadata.slot_mapping)
|
||||||
],
|
|
||||||
dim=2)
|
|
||||||
slots = attn_metadata.slot_mapping
|
|
||||||
torch_npu._npu_reshape_and_cache_siso(key=key,
|
|
||||||
key_cache=kv_cache,
|
|
||||||
slot_indices=slots)
|
|
||||||
|
|
||||||
if attn_metadata.num_prefills > 0:
|
if attn_metadata.num_prefills > 0:
|
||||||
attn_output = torch.empty(num_tokens,
|
attn_output = torch.empty(num_tokens,
|
||||||
|
|||||||
Reference in New Issue
Block a user