[Bugfix][APC] Fix accuracy issue on prefix caching with AscendScheduler (#2714)
### What this PR does / why we need it?
Fix an accuracy issue in prefix caching when using AscendScheduler.
### How was this patch tested?
CI passed with `test_prefix_cache_with_ascend_scheduler`
- vLLM version: v0.10.1.1
- vLLM main: 6997a25ac6
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
2
.github/workflows/vllm_ascend_test.yaml
vendored
2
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -291,6 +291,6 @@ jobs:
|
||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
|
||||
|
||||
#pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
|
||||
#pytest -sv tests/e2e/multicard/test_prefix_caching.py
|
||||
pytest -sv tests/e2e/multicard/test_prefix_caching.py
|
||||
pytest -sv tests/e2e/multicard/test_qwen3_moe.py
|
||||
pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py
|
||||
|
||||
@@ -67,7 +67,7 @@ class AscendScheduler(Scheduler):
|
||||
preempted_reqs: list[Request] = []
|
||||
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
req_to_new_block_ids: dict[str, list[int]] = {}
|
||||
req_to_new_block_ids: dict[str, list[list[int]]] = {}
|
||||
else:
|
||||
req_to_new_blocks: dict[str, KVCacheBlocks] = {}
|
||||
num_scheduled_tokens: dict[str, int] = {}
|
||||
@@ -231,7 +231,9 @@ class AscendScheduler(Scheduler):
|
||||
req_to_new_block_ids[request.request_id] = (
|
||||
self.kv_cache_manager.get_block_ids(request.request_id))
|
||||
else:
|
||||
req_to_new_blocks[request.request_id] = new_blocks
|
||||
req_to_new_blocks[
|
||||
request.request_id] = self.kv_cache_manager.get_blocks(
|
||||
request.request_id)
|
||||
# Update request info.
|
||||
num_scheduled_tokens[request.request_id] = num_new_tokens
|
||||
token_budget -= num_new_tokens
|
||||
@@ -354,7 +356,8 @@ class AscendScheduler(Scheduler):
|
||||
|
||||
# Get the longest common prefix among all requests in the running queue.
|
||||
# This can be potentially used for cascade attention.
|
||||
num_common_prefix_blocks = 0
|
||||
num_common_prefix_blocks = [0] * len(
|
||||
self.kv_cache_config.kv_cache_groups)
|
||||
if self.running:
|
||||
any_request = self.running[0]
|
||||
num_common_prefix_blocks = (
|
||||
|
||||
Reference in New Issue
Block a user