[bugfix] Fix mooncake kvpool accuracy issue (#4976)

### What this PR does / why we need it? The current KVPool has an accuracy issue (https://github.com/vllm-project/vllm-ascend/issues/4412). This PR aims to fix the precision problem without impacting prefill performance. Note: Due to a bug in ADXL, calling `current_event.synchronize()` may occasionally hang. This issue will be fixed in CANN version 8.5.RC1. You can manually build the master branch of the project at https://gitcode.com/cann/hixl to resolve this issue before the 8.5.RC1 release. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: LCAIZJ <leichao139636@163.com>
2025-12-16 11:33:16 +08:00
parent 9e24bdd44c
commit 9c02fa9867
3 changed files with 38 additions and 2 deletions
--- a/vllm_ascend/distributed/kvpool/pool_worker.py
+++ b/vllm_ascend/distributed/kvpool/pool_worker.py
@@ -251,12 +251,20 @@ class KVPoolWorker:
                      connector_metadata: AscendConnectorMetadata) -> None:
        if self.current_layer == 0:
            self.layerwise_storers = []
+            current_event = None
+            for request in connector_metadata.requests:
+                can_save = request.can_save
+                if can_save is None or not can_save:
+                    continue
+                current_event = torch.npu.Event()
+                current_event.record()
+                break
            for request in connector_metadata.requests:
                can_save = request.can_save
                if can_save is None or not can_save:
                    continue

-                layerwise_storer = self.store_layer(request)
+                layerwise_storer = self.store_layer(request, current_event)
                self.layerwise_storers.append(layerwise_storer)
        for layerwise_storer in self.layerwise_storers:
            try:
@@ -266,11 +274,21 @@ class KVPoolWorker:
        self.current_layer = self.current_layer + 1

    def wait_for_save(self, connector_metadata: AscendConnectorMetadata):
+        current_event = None
+        for request in connector_metadata.requests:
+            can_save = request.can_save
+            if can_save is None or not can_save:
+                continue
+            current_event = torch.npu.Event()
+            current_event.record()
+            break
+
        for request in connector_metadata.requests:
            can_save = request.can_save
            if can_save is None or not can_save:
                continue

+            request.current_event = current_event
            self.kv_send_thread.add_request(  # type: ignore[union-attr]
                request, )

@@ -347,6 +365,7 @@ class KVPoolWorker:
    def store_layer(
        self,
        request: ReqMeta,
+        current_event: Optional[torch.npu.Event],
    ) -> Generator[None, None, None]:
        """
        Store the KV cache in a layerwise manner.
@@ -385,7 +404,8 @@ class KVPoolWorker:
                                                   keys_multi_chunk, starts,
                                                   ends, request.block_ids,
                                                   layer_id,
-                                                   request.is_last_chunk)
+                                                   request.is_last_chunk,
+                                                   current_event)
                self.kv_send_thread.add_request(  # type: ignore[union-attr, call-arg]
                    req_meta)  # type: ignore[union-attr, call-arg, arg-type]
                yield