[Core] Support the features of prefix cache and chunked prefill in v0/v1 (#782)

### What this PR does / why we need it?

Support prefix caching and chunked prefill in v0/v1.

---------

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-05-09 16:39:28 +08:00
parent 324f819b92
commit fa99f89e93
6 changed files with 156 additions and 32 deletions
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -175,11 +175,11 @@ class NPUPlatform(Platform):
        if cache_config:
            if cache_config.block_size is None:
                cache_config.block_size = 128
-            if envs.VLLM_USE_V1 and cache_config.enable_prefix_caching:
+            if cache_config.enable_prefix_caching and cache_config.block_size != 128:
                logger.warning(
-                    "Prefix caching is not supported for V1 now, disable prefix caching"
+                    "If prefix caching is enabled, block size must be set to 128."
                )
-                cache_config.enable_prefix_caching = False
+                cache_config.block_size = 128

        if envs.VLLM_USE_V1:
            # Activate custom ops for v1.