From 984bd7c13a6b7eb80ac9cb43ab85a81afe779614 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Thu, 4 Sep 2025 08:22:46 +0800 Subject: [PATCH] [Bugfix][APC] Fix accuracy issue on prefix caching with AscendScheduler (#2714) ### What this PR does / why we need it? Fix accuracy issue on prefix caching with AscendScheduler ### How was this patch tested? CI passed with `test_prefix_cache_with_ascend_scheduler` - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/6997a25ac65ed6cc3c2be6d09ca45f633a345f63 --------- Signed-off-by: MengqingCao --- .github/workflows/vllm_ascend_test.yaml | 2 +- vllm_ascend/core/scheduler.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 9e67393..af8d035 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -291,6 +291,6 @@ jobs: pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe #pytest -sv tests/e2e/multicard/test_pipeline_parallel.py - #pytest -sv tests/e2e/multicard/test_prefix_caching.py + pytest -sv tests/e2e/multicard/test_prefix_caching.py pytest -sv tests/e2e/multicard/test_qwen3_moe.py pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index f274d3d..f8c7f49 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -67,7 +67,7 @@ class AscendScheduler(Scheduler): preempted_reqs: list[Request] = [] if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): - req_to_new_block_ids: dict[str, list[int]] = {} + req_to_new_block_ids: dict[str, list[list[int]]] = {} else: req_to_new_blocks: dict[str, KVCacheBlocks] = {} num_scheduled_tokens: dict[str, int] = {} @@ -231,7 +231,9 @@ class AscendScheduler(Scheduler): req_to_new_block_ids[request.request_id] = ( self.kv_cache_manager.get_block_ids(request.request_id)) else: - req_to_new_blocks[request.request_id] = new_blocks + req_to_new_blocks[ + request.request_id] = self.kv_cache_manager.get_blocks( + request.request_id) # Update request info. num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens @@ -354,7 +356,8 @@ class AscendScheduler(Scheduler): # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. - num_common_prefix_blocks = 0 + num_common_prefix_blocks = [0] * len( + self.kv_cache_config.kv_cache_groups) if self.running: any_request = self.running[0] num_common_prefix_blocks = (