[bugfix] Fix mooncake kvpool accuracy issue (#4976)

### What this PR does / why we need it?

The current KVPool has an accuracy issue; see
https://github.com/vllm-project/vllm-ascend/issues/4412. This PR aims to
fix the precision problem without impacting prefill performance.

Note: Due to a bug in ADXL, calling `current_event.synchronize()` may
occasionally hang. This issue will be fixed in CANN version 8.5.RC1.
Until that release, you can manually build the master branch of the
project at https://gitcode.com/cann/hixl to resolve this issue.


- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: LCAIZJ <leichao139636@163.com>
This commit is contained in:
Chao Lei
2025-12-16 11:33:16 +08:00
committed by GitHub
parent 9e24bdd44c
commit 9c02fa9867
3 changed files with 38 additions and 2 deletions

View File

@@ -251,12 +251,20 @@ class KVPoolWorker:
connector_metadata: AscendConnectorMetadata) -> None:
if self.current_layer == 0:
self.layerwise_storers = []
current_event = None
for request in connector_metadata.requests:
can_save = request.can_save
if can_save is None or not can_save:
continue
current_event = torch.npu.Event()
current_event.record()
break
for request in connector_metadata.requests:
can_save = request.can_save
if can_save is None or not can_save:
continue
layerwise_storer = self.store_layer(request)
layerwise_storer = self.store_layer(request, current_event)
self.layerwise_storers.append(layerwise_storer)
for layerwise_storer in self.layerwise_storers:
try:
@@ -266,11 +274,21 @@ class KVPoolWorker:
self.current_layer = self.current_layer + 1
def wait_for_save(self, connector_metadata: AscendConnectorMetadata):
current_event = None
for request in connector_metadata.requests:
can_save = request.can_save
if can_save is None or not can_save:
continue
current_event = torch.npu.Event()
current_event.record()
break
for request in connector_metadata.requests:
can_save = request.can_save
if can_save is None or not can_save:
continue
request.current_event = current_event
self.kv_send_thread.add_request( # type: ignore[union-attr]
request, )
@@ -347,6 +365,7 @@ class KVPoolWorker:
def store_layer(
self,
request: ReqMeta,
current_event: Optional[torch.npu.Event],
) -> Generator[None, None, None]:
"""
Store the KV cache in a layerwise manner.
@@ -385,7 +404,8 @@ class KVPoolWorker:
keys_multi_chunk, starts,
ends, request.block_ids,
layer_id,
request.is_last_chunk)
request.is_last_chunk,
current_event)
self.kv_send_thread.add_request( # type: ignore[union-attr, call-arg]
req_meta) # type: ignore[union-attr, call-arg, arg-type]
yield