[Feat] 310p supports PrefillCacheHit State (#6756)
### What this PR does / why we need it?
This PR extends the Ascend 310P attention backend to support the
`PrefillCacheHit` state. Previously, only `PrefillNoCache`,
`DecodeOnly`, and `ChunkedPrefill` were supported.
The new state is handled by routing it to the existing
`forward_chunked_prefill_310` implementation, which already covers this
scenario.
The changes also include refactoring the main `forward_impl` dispatch
method for better clarity and updating unit tests to cover the new state
and ensure correctness.
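The routing change amounts to one extra case in the state dispatch. A minimal, self-contained sketch of the pattern (a toy stand-in for illustration only — the real vllm-ascend classes operate on query/key/value tensors, the KV cache, and attention metadata):

```python
from enum import Enum, auto


class AscendAttentionState(Enum):
    # Simplified stand-in for the real vLLM-Ascend enum.
    PrefillNoCache = auto()
    DecodeOnly = auto()
    ChunkedPrefill = auto()
    PrefillCacheHit = auto()
    SpecDecoding = auto()


class Toy310Backend:
    """Toy model of the 310P dispatch; method names mirror the diff below."""

    def forward_prefill_310(self):
        return "prefill_310"

    def forward_paged_attention(self):
        return "paged_attention"

    def forward_chunked_prefill_310(self):
        return "chunked_prefill_310"

    def forward_impl(self, state):
        if state == AscendAttentionState.PrefillNoCache:
            return self.forward_prefill_310()
        elif state == AscendAttentionState.DecodeOnly:
            return self.forward_paged_attention()
        # PrefillCacheHit reuses the chunked-prefill kernel path.
        elif state in (AscendAttentionState.ChunkedPrefill,
                       AscendAttentionState.PrefillCacheHit):
            return self.forward_chunked_prefill_310()
        else:
            raise NotImplementedError(
                f"AscendAttentionState: {state} is not supported for 310P currently.")


backend = Toy310Backend()
# The new state takes the same path as ChunkedPrefill.
assert backend.forward_impl(AscendAttentionState.PrefillCacheHit) == "chunked_prefill_310"
assert backend.forward_impl(AscendAttentionState.ChunkedPrefill) == "chunked_prefill_310"
```

Unsupported states (e.g. the speculative-decoding case) still fall through to `NotImplementedError`, as in the diff.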
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Accuracy test with chunked prefill disabled.
- vLLM version: v0.15.0
- vLLM main:
9562912cea
---------
Signed-off-by: pu-zhe <zpuaa@outlook.com>
```diff
@@ -198,6 +198,8 @@ class AscendAttentionBackendImpl310(AscendAttentionBackendImpl):
             out=output,
         )

         return output

     def forward_impl(self, query, key, value, kv_cache, attn_metadata, output):
         """
         Main dispatch method for attention operations.
@@ -218,22 +220,19 @@ class AscendAttentionBackendImpl310(AscendAttentionBackendImpl):
             NotImplementedError: If the attention state is not supported on 310P.
         """
         state = attn_metadata.attn_state

-        if state == AscendAttentionState.DecodeOnly:
-            return self.forward_paged_attention(query, attn_metadata, output)
-
         # Condition for PrefillNoCache: No previous tokens have been processed yet
         if state == AscendAttentionState.PrefillNoCache:
-            out = self.forward_prefill_310(query, key, value, attn_metadata, output)
-            return out
-
-        if state == AscendAttentionState.ChunkedPrefill:
-            self.forward_chunked_prefill_310(query, attn_metadata, output)
-            return output
-
-        raise NotImplementedError(
-            f"{self.__class__.__name__}.forward_impl: 310P only supports "
-            f"{AscendAttentionState.DecodeOnly.name}, "
-            f"{AscendAttentionState.PrefillNoCache.name}, "
-            f"{AscendAttentionState.ChunkedPrefill.name}, "
-            f"got {state!r}."
-        )
+            output = self.forward_prefill_310(query, key, value, attn_metadata, output)
+        # Condition for DecodeOnly: Pure decoding phase where each request generates one token
+        elif state == AscendAttentionState.DecodeOnly:
+            output = self.forward_paged_attention(query, attn_metadata, output)
+        # Condition for ChunkedPrefill:
+        # 1. During speculative decoding scenarios (except mtp)
+        # 2. Processing large prefill requests in chunks
+        # Condition for PrefillCacheHit: Indicates prefill with some cached tokens already processed
+        elif state in [AscendAttentionState.ChunkedPrefill, AscendAttentionState.PrefillCacheHit]:
+            output = self.forward_chunked_prefill_310(query, attn_metadata, output)
+        # Condition for SpecDecoding: Specified for mtp, which is not supported yet.
+        else:
+            raise NotImplementedError(f"AscendAttentionState: {state} is not supported for 310P currently.")
+        return output
```