diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 9e67393..af8d035 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -291,6 +291,6 @@ jobs: pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe #pytest -sv tests/e2e/multicard/test_pipeline_parallel.py - #pytest -sv tests/e2e/multicard/test_prefix_caching.py + pytest -sv tests/e2e/multicard/test_prefix_caching.py pytest -sv tests/e2e/multicard/test_qwen3_moe.py pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index f274d3d..f8c7f49 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -67,7 +67,7 @@ class AscendScheduler(Scheduler): preempted_reqs: list[Request] = [] if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): - req_to_new_block_ids: dict[str, list[int]] = {} + req_to_new_block_ids: dict[str, list[list[int]]] = {} else: req_to_new_blocks: dict[str, KVCacheBlocks] = {} num_scheduled_tokens: dict[str, int] = {} @@ -231,7 +231,9 @@ class AscendScheduler(Scheduler): req_to_new_block_ids[request.request_id] = ( self.kv_cache_manager.get_block_ids(request.request_id)) else: - req_to_new_blocks[request.request_id] = new_blocks + req_to_new_blocks[ + request.request_id] = self.kv_cache_manager.get_blocks( + request.request_id) # Update request info. num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens @@ -354,7 +356,8 @@ class AscendScheduler(Scheduler): # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. - num_common_prefix_blocks = 0 + num_common_prefix_blocks = [0] * len( + self.kv_cache_config.kv_cache_groups) if self.running: any_request = self.running[0] num_common_prefix_blocks = (