From 90a75a90a9adc8be8efa294bada8391bb1607e0d Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Wed, 3 Sep 2025 17:56:12 +0800
Subject: [PATCH] [bugfix] fix torchair runtime error caused by configuration
 mismtaches and file missing (#2532)

### What this PR does / why we need it?
This PR ports #2312 #2506 #2531 to main branch.

Original implementation of torchair caching forces users to make
everything prepared, fix all the configuration and enable
`use_cached_npu_graph`, and it might cause some problems confusing to
understand and tackle for users. It is better to compile the graph twice
instead of reusing the old kvcaches and cached torchair graph. And the
extra duration time is acceptable. Additionally, this pr fixes a
recompilation problem of torchair graph mode caused by
`running_in_graph` variable in `AscendMLATorchairImpl`.

### Does this PR introduce _any_ user-facing change?
If users want to enabling torchair.cache_compile with high compilation
speed, it is recommended to enable both `use_cached_kv_cache_bytes` and
`use_cached_graph` in `torchair_graph_config`. Without
`use_cached_kv_cache_bytes`, we'll compile torchair computation graph
twice to avoid runtime error caused by configuration mismtaches (the
second compilation will be much faster). Additionally, we've made a
change to how the TORCHAIR_CACHE_HOME enviroment variable is utilized to
enhance safety and prevent accidental file deletion by adding a suffix
directory.

### How was this patch tested?
CI and e2e vllm serving pass.


- vLLM version: v0.10.1.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/70549c1245c3eeb3706e3c09a9e18d702fbf705f

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 tests/ut/test_ascend_config.py                | 34 +++++++++++++++++++
 tests/ut/torchair/test_utils.py               | 10 ++++++
 vllm_ascend/ascend_config.py                  | 10 ++++++
 vllm_ascend/platform.py                       | 14 ++++++++
 vllm_ascend/torchair/torchair_mla.py          |  1 +
 vllm_ascend/torchair/torchair_model_runner.py | 23 +++++++++----
 vllm_ascend/torchair/torchair_worker.py       |  6 ++--
 vllm_ascend/torchair/utils.py                 |  8 +++--
 vllm_ascend/worker/mtp_proposer_v1.py         |  4 ++-
 9 files changed, 97 insertions(+), 13 deletions(-)

diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py
index ad252b4..4c7cfa6 100644
--- a/tests/ut/test_ascend_config.py
+++ b/tests/ut/test_ascend_config.py
@@ -262,6 +262,40 @@ class TestAscendConfig(TestBase):
             }
             init_ascend_config(test_vllm_config)
 
+        # use_cached_kv_cache_bytes should not be enabled without torchair graph mode
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": False,
+                    "use_cached_kv_cache_bytes": True,
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+
+        # graph_batch_sizes should not be set without torchair graph mode
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": False,
+                    "graph_batch_sizes": [1, 2, 4],
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+
+        # use_cached_kv_cache_bytes is valid only when torchair graph mode and use_cached_graph are enabled
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                    "use_cached_graph": False,
+                    "use_cached_kv_cache_bytes": True,
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+
         # graph_batch_sizes_init should not be enabled without torchair graph mode
         with self.assertRaises(RuntimeError):
             test_vllm_config.additional_config = {
diff --git a/tests/ut/torchair/test_utils.py b/tests/ut/torchair/test_utils.py
index 8aebb9d..b68bc31 100644
--- a/tests/ut/torchair/test_utils.py
+++ b/tests/ut/torchair/test_utils.py
@@ -49,6 +49,16 @@ class TestTorchairUtils(TestBase):
         self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
                          "Delete kv cache bytes cache dir failed")
 
+    def test_delete_torchair_cache_file_multiple_times(self):
+        utils.write_kv_cache_bytes_to_file(0, 100)
+        utils.delete_torchair_cache_file()
+        for i in range(5):
+            try:
+                utils.delete_torchair_cache_file()
+            except FileNotFoundError:
+                self.fail(
+                    f"Unexpected FileNotFoundError on delete call #{i+2}")
+
     @patch('vllm.ModelRegistry')
     def test_register_torchair_model(self, mock_model_registry):
         mock_registry = MagicMock()
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index 597ff5b..e46cd9a 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -73,6 +73,8 @@ class TorchairGraphConfig:
         self.mode = torchair_graph_config.get("mode", '')
         self.use_cached_graph = torchair_graph_config.get(
             "use_cached_graph", False)
+        self.use_cached_kv_cache_bytes = torchair_graph_config.get(
+            "use_cached_kv_cache_bytes", False)
         self.graph_batch_sizes = torchair_graph_config.get(
             "graph_batch_sizes", [])
         self.graph_batch_sizes_init = torchair_graph_config.get(
@@ -99,6 +101,10 @@ class TorchairGraphConfig:
                 raise RuntimeError(
                     "use_cached_graph is valid only when Torchair graph mode is enabled"
                 )
+            if self.use_cached_kv_cache_bytes:
+                raise RuntimeError(
+                    "use_cached_kv_cache_bytes is valid only when Torchair graph mode is enabled"
+                )
             if self.graph_batch_sizes:
                 raise RuntimeError(
                     "graph_batch_sizes is valid only when Torchair graph mode is enabled"
@@ -119,6 +125,10 @@ class TorchairGraphConfig:
                 raise RuntimeError(
                     "enable_kv_nz is valid only when Torchair graph mode is enabled"
                 )
+        if self.use_cached_kv_cache_bytes and not self.use_cached_graph:
+            raise RuntimeError(
+                "use_cached_kv_cache_bytes is valid only when Torchair graph mode and use_cached_graph are enabled"
+            )
 
 
 class AscendSchedulerConfig:
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index afc0e6b..57ace2b 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -28,6 +28,8 @@ from vllm.platforms import Platform, PlatformEnum
 
 from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
                                        init_ascend_config)
+from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
+                                        delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, is_310p,
                                update_aclgraph_sizes)
 
@@ -170,6 +172,18 @@ class NPUPlatform(Platform):
                 "Torchair compilation enabled on NPU. Setting CUDAGraphMode to NONE"
             )
             compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+            # Note: We delete the torchair cache folder here to prevent runtime issues caused by dimension
+            # mismatches or configuration inconsistencies when users reuse cached computation graphs. Though
+            # this will increase graph compilation duration, it significantly enhances robustness and decreases
+            # graph launching time during inference.
+            if check_torchair_cache_exist(
+            ) and not ascend_config.torchair_graph_config.use_cached_kv_cache_bytes:
+                logger.warning(
+                    "Torchair cache folder is deleted here to prevent runtime issues caused by dimension "
+                    "mismatches or configuration inconsistencies when users reuse cached computation graphs. "
+                    "In order to decrease torchair graph compilation time, users can enable both use_cached_graph "
+                    "and use_cached_kv_cache_bytes in torchair_graph_config.")
+                delete_torchair_cache_file()
 
         if parallel_config.distributed_executor_backend == "ray":
             logger.warning(
diff --git a/vllm_ascend/torchair/torchair_mla.py b/vllm_ascend/torchair/torchair_mla.py
index 036db47..30ef293 100644
--- a/vllm_ascend/torchair/torchair_mla.py
+++ b/vllm_ascend/torchair/torchair_mla.py
@@ -628,6 +628,7 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
         self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
         self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+        self.running_in_graph = False
 
         # Adapt torch air graph mode with spec decoding.
         speculative_config = get_current_vllm_config().speculative_config
diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py
index 7d2f605..2b34f9b 100644
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -35,8 +35,9 @@ import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.torchair.utils import (
-    TorchairCommonAttentionMetadata, check_torchair_cache_exist,
-    converting_weight_acl_format, register_torchair_model, torchair_ops_patch,
+    TORCHAIR_CACHE_DIR, TorchairCommonAttentionMetadata,
+    check_torchair_cache_exist, converting_weight_acl_format,
+    register_torchair_model, torchair_ops_patch,
     torchair_quant_method_register, write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                is_310p)
@@ -52,6 +53,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
         self.torchair_compiled_model = None  # type: ignore
         self.torchair_compiled_models = {}  # type: ignore
         self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
+        self.use_cached_kv_cache_bytes = ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
         self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes
         if ascend_config.torchair_graph_config.graph_batch_sizes_init:
             self.init_torchair_graph_batch_sizes()
@@ -194,14 +196,20 @@ class NPUTorchairModelRunner(NPUModelRunner):
         graph_num = len(torchair_graph_batch_sizes)
 
         if self.use_cached_npu_graph and not check_torchair_cache_exist():
-            # If caching is enabled but does not exist, we will compile the model twice. The first
-            # time is used to generate the cache, and the second time is used to load the cache to
-            # skip the overhead caused by Dynamo guard mechanism.
+            # If caching is enabled but does not exist (either
+            # use_cached_kv_cache_bytes is disabled or kv_cache_bytes are
+            # different), we will compile the model twice. The first time is
+            # used to generate the cache, and the second time is used to load the
+            # cache to skip the overhead caused by Dynamo guard mechanism.
             logger.info(
-                "Use cached npu graph but cache doesn't exist! Now we compile graph to genetate torchair cache, this usually takes %.1f~%.1f mins.",
+                "Cache compilation for torchair graph is enabled. Now we compile graph to genetate"
+                " torchair cache, this usually takes %.1f~%.1f mins.",
                 0.5 * graph_num, 1.5 * graph_num)
             self._compile_torchair_graph(torchair_graph_batch_sizes)
             NPUPlatform.synchronize()
+            # Note: We reset dynamo and reload the compiled torchair cached computation graph below
+            # that was compiled above. This operation reduces graph launch time by 2-4ms and avoids
+            # runtime errors caused by configuration mismatches in graph mode.
             torch._dynamo.reset()
             self.torchair_compiled_models.clear()
         if self.use_cached_npu_graph:
@@ -215,7 +223,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
                 0.5 * graph_num, 1.5 * graph_num)
             self._compile_torchair_graph(torchair_graph_batch_sizes)
 
-        if self.new_kv_cache_bytes > 0:
+        if self.use_cached_kv_cache_bytes and self.new_kv_cache_bytes > 0:
             write_kv_cache_bytes_to_file(torch.distributed.get_rank(),
                                          self.new_kv_cache_bytes)
 
@@ -362,6 +370,7 @@ class NPUTorchairModelRunner(NPUModelRunner):
                     self.model.__dict__[forward_proxy_name],
                     dynamic=True,
                     fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                    cache_dir=TORCHAIR_CACHE_DIR,
                     config=config,
                     ge_cache=False)
             return self.torchair_compiled_models[batch_size]
diff --git a/vllm_ascend/torchair/torchair_worker.py b/vllm_ascend/torchair/torchair_worker.py
index 3488ac7..85f2fb4 100644
--- a/vllm_ascend/torchair/torchair_worker.py
+++ b/vllm_ascend/torchair/torchair_worker.py
@@ -17,9 +17,9 @@ import torch
 from vllm.logger import logger
 
 import vllm_ascend.envs as envs_ascend
+from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner
 from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist,
-                                        check_torchair_cache_exist,
                                         delete_torchair_cache_file,
                                         read_kv_cache_bytes_from_file)
 from vllm_ascend.worker.worker_v1 import NPUWorker
@@ -33,7 +33,9 @@ class NPUTorchairWorker(NPUWorker):
 
         available_kv_cache_memory = super().determine_available_memory()
 
-        if check_torchair_cache_exist() and check_kv_cache_bytes_cache_exist():
+        if get_ascend_config(
+        ).torchair_graph_config.use_cached_kv_cache_bytes and check_kv_cache_bytes_cache_exist(
+        ):
             old_kv_cache_bytes = read_kv_cache_bytes_from_file(
                 torch.distributed.get_rank())
             if 0 < old_kv_cache_bytes <= available_kv_cache_memory:
diff --git a/vllm_ascend/torchair/utils.py b/vllm_ascend/torchair/utils.py
index 8dd1e3f..563fda7 100644
--- a/vllm_ascend/torchair/utils.py
+++ b/vllm_ascend/torchair/utils.py
@@ -18,8 +18,8 @@ except ImportError:
 KV_CACHE_BYTES_CACHE_PATH_NAME = ".kv_cache_bytes"
 KV_CACHE_BYTES_CACHE_FILE_NAME = "kv_cache_bytes"
 TORCHAIR_CACHE_PATH_NAME = ".torchair_cache"
-TORCHAIR_CACHE_DIR = os.getenv(
-    'TORCHAIR_CACHE_HOME', os.path.join(os.getcwd(), TORCHAIR_CACHE_PATH_NAME))
+TORCHAIR_CACHE_DIR = os.path.join(
+    os.getenv('TORCHAIR_CACHE_HOME', os.getcwd()), TORCHAIR_CACHE_PATH_NAME)
 
 
 @dataclass
@@ -111,8 +111,10 @@ def write_kv_cache_bytes_to_file(rank, kv_cache_bytes):
 
 def delete_torchair_cache_file():
     torch_air_abs_path = _get_torchair_current_work_dir()
-    if os.path.exists(torch_air_abs_path):
+    try:
         shutil.rmtree(torch_air_abs_path)
+    except FileNotFoundError:
+        pass
 
 
 def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True):
diff --git a/vllm_ascend/worker/mtp_proposer_v1.py b/vllm_ascend/worker/mtp_proposer_v1.py
index e5d555b..e8f369f 100644
--- a/vllm_ascend/worker/mtp_proposer_v1.py
+++ b/vllm_ascend/worker/mtp_proposer_v1.py
@@ -20,7 +20,8 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP
 from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
     TorchairDeepSeekMTP
-from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
+from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR,
+                                        TorchairCommonAttentionMetadata)
 from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
 
 
@@ -411,6 +412,7 @@ class MtpProposer:
                     self.model.__dict__[forward_proxy_name],
                     dynamic=True,
                     fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                    cache_dir=TORCHAIR_CACHE_DIR,
                     config=config,
                     ge_cache=False)
             return self.torchair_compiled_models[batch_size]