feat: add kv cache memory cache and skip dynamo guard (#1549)

### What this PR does / why we need it?

1、Sometimes loading torchair cache will fail because of the floating of
npu memory, so this pr add a new cache to save the old kv cache bytes to
avoid the possible crash while loading the torchair graph cache.
2、When caching is enabled and does not exist, the first compilation
introduces the overhead of Dynamo Gurad. So in this case, we will
compile them directly twice to skip them (This will bring 3-4 ms of tpot
optimization)

### Does this PR introduce _any_ user-facing change?
Add a new env `VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE` to
control kv cache floating tolerance

### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main:
1fd471e957

Signed-off-by: boying <897013703@qq.com>
This commit is contained in:
NeverRaR
2025-07-07 22:37:14 +08:00
committed by GitHub
parent df84cceca8
commit 71de52d3a9
5 changed files with 182 additions and 24 deletions

View File

@@ -280,6 +280,27 @@ class TestUtils(TestBase):
3,
len(test_vllm_config.compilation_config.cudagraph_capture_sizes))
def test_get_torchair_current_work_dir(self):
cache_dir = utils.TORCHAIR_CACHE_DIR
work_dir = utils.get_torchair_current_work_dir()
self.assertEqual(cache_dir, work_dir)
work_dir = utils.get_torchair_current_work_dir("test")
self.assertEqual(os.path.join(cache_dir, "test"), work_dir)
def test_torchair_cache_dir(self):
utils.write_kv_cache_bytes_to_file(0, 100)
self.assertTrue(utils.check_torchair_cache_exist(),
"Create torchair cache dir failed")
self.assertTrue(utils.check_kv_cache_bytes_cache_exist(),
"Create kv cache bytes cache dir failed")
kv_cache_bytes = utils.read_kv_cache_bytes_from_file(0)
self.assertEqual(100, kv_cache_bytes)
utils.delete_torchair_cache_file()
self.assertFalse(utils.check_torchair_cache_exist(),
"Delete torchair cache dir failed")
self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
"Delete kv cache bytes cache dir failed")
class TestProfileExecuteDuration(unittest.TestCase):