[Core] Support the features of prefix cache and chunked prefill in v0/v1 (#782)

### What this PR does / why we need it? Add support for the prefix cache and chunked prefill features in v0/v1. --------- Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-05-09 16:39:28 +08:00
parent 324f819b92
commit fa99f89e93
6 changed files with 156 additions and 32 deletions
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -417,7 +417,7 @@ class AscendMLAImpl(MLAAttentionImpl):

        num_tokens = query.size(0)
        attn_output = None
-        # Here is only 2 possibility of input, ChunkedPrefill or PrefillOnly
+        # There are only 2 possible input states here: ChunkedPrefill or PrefillNoCache
        if attn_metadata.attn_state == AscendAttentionState.ChunkedPrefill:
            attn_output = torch.empty(num_tokens,
                                      self.num_heads * self.v_head_dim,
@@ -440,7 +440,7 @@ class AscendMLAImpl(MLAAttentionImpl):
                scale=self.scale,
                alibi_slopes=None,
                causal=True)
-        elif attn_metadata.attn_state == AscendAttentionState.PrefillOnly:
+        elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
            attn_output = torch.empty(num_tokens,
                                      self.num_heads,
                                      self.padding_head_dim,
@@ -479,7 +479,7 @@ class AscendMLAImpl(MLAAttentionImpl):
                self.padding_head_dim)[:, :, :self.v_head_dim]
        else:
            raise RuntimeError(
-                "Unexpected path reached, AscendMLAImpl should only have PrefillOnly and ChunkedPrefill scenario in forward prefill, please file a bug to vllm-ascend !"
+                "Unexpected path reached, AscendMLAImpl should only have PrefillNoCache and ChunkedPrefill scenario in forward prefill, please file a bug to vllm-ascend !"
            )
        attn_output = attn_output.reshape(
            [num_tokens, self.num_heads * self.v_head_dim])