[bugfix] fix torchair runtime error caused by configuration mismatches and missing files (#2532)

### What this PR does / why we need it? This PR ports #2312 #2506 #2531 to main branch. The original implementation of torchair caching forces users to prepare everything in advance, fix all the configuration and enable `use_cached_npu_graph`, and it might cause problems that are confusing for users to understand and tackle. It is better to compile the graph twice instead of reusing the old kvcaches and cached torchair graph, and the extra duration is acceptable. Additionally, this PR fixes a recompilation problem of torchair graph mode caused by the `running_in_graph` variable in `AscendMLATorchairImpl`. ### Does this PR introduce _any_ user-facing change? If users want to enable torchair.cache_compile with high compilation speed, it is recommended to enable both `use_cached_kv_cache_bytes` and `use_cached_graph` in `torchair_graph_config`. Without `use_cached_kv_cache_bytes`, we'll compile the torchair computation graph twice to avoid runtime errors caused by configuration mismatches (the second compilation will be much faster). Additionally, we've changed how the TORCHAIR_CACHE_HOME environment variable is utilized to enhance safety and prevent accidental file deletion by adding a suffix directory. ### How was this patch tested? CI and e2e vllm serving pass. - vLLM version: v0.10.1.1 - vLLM main: 70549c1245 --------- Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-09-03 17:56:12 +08:00
parent 5889fa1b1c
commit 90a75a90a9
9 changed files with 97 additions and 13 deletions
--- a/tests/ut/test_ascend_config.py
+++ b/tests/ut/test_ascend_config.py
@@ -262,6 +262,40 @@ class TestAscendConfig(TestBase):
            }
            init_ascend_config(test_vllm_config)

+        # use_cached_kv_cache_bytes should not be enabled without torchair graph mode
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": False,
+                    "use_cached_kv_cache_bytes": True,
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+
+        # graph_batch_sizes should not be set without torchair graph mode
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": False,
+                    "graph_batch_sizes": [1, 2, 4],
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+
+        # use_cached_kv_cache_bytes is valid only when torchair graph mode and use_cached_graph are enabled
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                    "use_cached_graph": False,
+                    "use_cached_kv_cache_bytes": True,
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+
        # graph_batch_sizes_init should not be enabled without torchair graph mode
        with self.assertRaises(RuntimeError):
            test_vllm_config.additional_config = {
--- a/tests/ut/torchair/test_utils.py
+++ b/tests/ut/torchair/test_utils.py
@@ -49,6 +49,16 @@ class TestTorchairUtils(TestBase):
        self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
                         "Delete kv cache bytes cache dir failed")

+    def test_delete_torchair_cache_file_multiple_times(self):
+        utils.write_kv_cache_bytes_to_file(0, 100)
+        utils.delete_torchair_cache_file()
+        for i in range(5):
+            try:
+                utils.delete_torchair_cache_file()
+            except FileNotFoundError:
+                self.fail(
+                    f"Unexpected FileNotFoundError on delete call #{i+2}")
+
    @patch('vllm.ModelRegistry')
    def test_register_torchair_model(self, mock_model_registry):
        mock_registry = MagicMock()