feat: add kv cache memory cache and skip dynamo guard (#1549)

### What this PR does / why we need it? 1. Loading the torchair graph cache sometimes fails because the available NPU memory fluctuates between runs, so this PR adds a new cache that saves the previously used KV cache size in bytes, avoiding a possible crash while loading the torchair graph cache. 2. When caching is enabled but no cache exists yet, the first compilation incurs the overhead of Dynamo guards. In this case we compile the graph twice directly to skip the guards (this brings a 3-4 ms TPOT improvement). ### Does this PR introduce _any_ user-facing change? Adds a new environment variable, `VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE`, to control the KV cache floating tolerance. ### How was this patch tested? - vLLM version: v0.9.1 - vLLM main: 1fd471e957 Signed-off-by: boying <897013703@qq.com>
2025-07-07 22:37:14 +08:00
parent df84cceca8
commit 71de52d3a9
5 changed files with 182 additions and 24 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -18,7 +18,10 @@
 #

 import atexit
+import fcntl
 import math
+import os
+import shutil
 from contextlib import contextmanager, nullcontext
 from enum import Enum
 from threading import Lock
@@ -440,3 +443,77 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool,
        return FusedMoEState.All2All
    else:
        return FusedMoEState.MC2
+
+
+# Name of the sub-directory (inside the torchair cache directory) that holds
+# the per-rank KV cache byte-count files.
+KV_CACHE_BYTES_CACHE_PATH_NAME = ".kv_cache_bytes"
+# Base name of a KV cache byte-count file; the read/write helpers below
+# prefix it with "<rank>_".
+KV_CACHE_BYTES_CACHE_FILE_NAME = "kv_cache_bytes"
+# Default directory name of the torchair cache, located under the current
+# working directory.
+TORCHAIR_CACHE_PATH_NAME = ".torchair_cache"
+# Root of the torchair cache: the TORCHAIR_CACHE_HOME environment variable
+# when it is set, otherwise TORCHAIR_CACHE_PATH_NAME under the working
+# directory. Evaluated once, when this module is imported.
+TORCHAIR_CACHE_DIR = os.getenv(
+    'TORCHAIR_CACHE_HOME', os.path.join(os.getcwd(), TORCHAIR_CACHE_PATH_NAME))
+
+
+def get_torchair_current_work_dir(file_name=None):
+    if file_name is None:
+        return TORCHAIR_CACHE_DIR
+    return os.path.join(TORCHAIR_CACHE_DIR, file_name)
+
+
+def check_torchair_cache_exist():
+    res = False
+    torch_air_abs_path = get_torchair_current_work_dir()
+    if os.path.exists(torch_air_abs_path):
+        file_list = os.listdir(torch_air_abs_path)
+        if len(file_list) != 0:
+            res = True
+    return res
+
+
+def check_kv_cache_bytes_cache_exist():
+    res = False
+    kv_cache_bytes_cache_abs_path = get_torchair_current_work_dir(
+        KV_CACHE_BYTES_CACHE_PATH_NAME)
+    if os.path.exists(kv_cache_bytes_cache_abs_path):
+        file_list = os.listdir(kv_cache_bytes_cache_abs_path)
+        if len(file_list) != 0:
+            res = True
+    return res
+
+
+def read_kv_cache_bytes_from_file(rank) -> int:
+    kv_cache_bytes = -1
+    kv_cache_bytes_cache_abs_path = get_torchair_current_work_dir(
+        KV_CACHE_BYTES_CACHE_PATH_NAME)
+    kv_cache_bytes_file = os.path.join(
+        kv_cache_bytes_cache_abs_path,
+        f"{rank}_{KV_CACHE_BYTES_CACHE_FILE_NAME}")
+    with open(kv_cache_bytes_file, "r", encoding="utf-8") as f:
+        with file_lock(f, fcntl.LOCK_SH):
+            kv_cache_bytes = int(f.readline())
+    return kv_cache_bytes
+
+
+@contextmanager
+def file_lock(file_descriptor, lock_type):
+    fcntl.flock(file_descriptor, lock_type)
+    try:
+        yield
+    finally:
+        fcntl.flock(file_descriptor, fcntl.LOCK_UN)
+
+
+def write_kv_cache_bytes_to_file(rank, kv_cache_bytes):
+    kv_cache_bytes_cache_abs_path = get_torchair_current_work_dir(
+        KV_CACHE_BYTES_CACHE_PATH_NAME)
+    os.makedirs(kv_cache_bytes_cache_abs_path, exist_ok=True)
+    kv_cache_bytes_file = os.path.join(
+        kv_cache_bytes_cache_abs_path,
+        f"{rank}_{KV_CACHE_BYTES_CACHE_FILE_NAME}")
+    with open(kv_cache_bytes_file, "w", encoding="utf-8") as f:
+        with file_lock(f, fcntl.LOCK_EX):
+            f.write(f"{kv_cache_bytes}")
+
+
+def delete_torchair_cache_file():
+    torch_air_abs_path = get_torchair_current_work_dir()
+    if os.path.exists(torch_air_abs_path):
+        shutil.rmtree(torch_air_abs_path)