[bugfix] Fix mooncake kvpool accuracy issue (#4976)

### What this PR does / why we need it? The current KVPool has an accuracy issue: https://github.com/vllm-project/vllm-ascend/issues/4412. This PR aims to fix the precision problem without impacting prefill performance. Note: Due to a bug in ADXL, calling `current_event.synchronize()` may occasionally hang. This issue will be fixed in CANN version 8.5.RC1. You can manually build the master branch of the project at https://gitcode.com/cann/hixl to resolve this issue before the 8.5.RC1 release. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: LCAIZJ <leichao139636@163.com>
2025-12-16 11:33:16 +08:00
parent 9e24bdd44c
commit 9c02fa9867
3 changed files with 38 additions and 2 deletions
--- a/vllm_ascend/distributed/kvpool/config_data.py
+++ b/vllm_ascend/distributed/kvpool/config_data.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from typing import Iterable, List, Optional, Tuple, Union

+import torch
 from vllm.distributed.kv_transfer.kv_connector.v1.base import \
    KVConnectorMetadata
 from vllm.logger import logger
@@ -284,6 +285,8 @@ class ReqMeta:

    is_last_chunk: Optional[bool] = None

+    current_event: Optional[torch.npu.Event] = None
+
    @staticmethod
    def from_request_tracker(
        tracker: RequestTracker,
@@ -375,3 +378,4 @@ class LasyerMultiBlockReqMeta:
    block_ids: list[int]
    layer_id: int
    is_last_chunk: Optional[bool] = True
+    current_event: Optional[torch.npu.Event] = None