[bugfix][npugraph_ex]fix static kernel uninstall issue (#6128)

### What this PR does / why we need it? The static kernel in torch_npu is uninstalled through Python's atexit mechanism. However, in vllm-ascend, the worker process is terminated when inference ends or the service stops. Terminating the process this way does not trigger the atexit mechanism, so the static kernel is never unloaded. When the npugraph_ex backend is used with the static kernel enabled, we register a signal handler that explicitly unloads the static kernel. When there are many static kernels, unloading usually takes some time, whereas vllm kills the process directly after sending a terminate event. Therefore, we perform the unload in a newly started process. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.13.0 - vLLM main: d68209402d --------- Signed-off-by: chencangtao <chencangtao@huawei.com> Co-authored-by: chencangtao <chencangtao@huawei.com>
2026-01-26 15:03:18 +08:00
parent f910cebe04
commit 1645546661
1 changed files with 51 additions and 0 deletions
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -131,6 +131,57 @@ class NPUWorker(WorkerBase):

        self.use_v2_model_runner = envs_vllm.VLLM_USE_V2_MODEL_RUNNER

+        npugraph_ex_config = get_ascend_config().npugraph_ex_config
+        if npugraph_ex_config.enable and npugraph_ex_config.enable_static_kernel:
+            # Prevent duplicate triggers, execute the exit logic only once
+            shutdown_request = False
+
+            def signal_handler(signum, frame):
+                nonlocal shutdown_request
+                if not shutdown_request:
+                    shutdown_request = True
+                    self.uninstall_static_kernel()
+                    raise SystemExit()
+
+            # Either SIGTERM or SIGINT will terminate the worker
+            import signal
+            signal.signal(signal.SIGTERM, signal_handler)
+            signal.signal(signal.SIGINT, signal_handler)
+
+
+    def uninstall_static_kernel(self):
+        import os
+        import fcntl
+        import subprocess
+
+        ascend_home_path = os.environ["ASCEND_HOME_PATH"]
+        static_kernel_dir_path = os.path.join(ascend_home_path, 'opp/static_kernel')
+        uninstall_script_path = os.path.join(static_kernel_dir_path, 'ai_core/uninstall.sh')
+        lock_file_path = os.path.join(static_kernel_dir_path, 'uninstall.lock')
+
+        if not os.path.exists(uninstall_script_path):
+            return
+        with open(lock_file_path, 'w') as lock_fd:
+            try:
+                fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                subprocess.Popen(
+                    ['bash', uninstall_script_path],
+                    stdin=subprocess.DEVNULL,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                    start_new_session=True
+                )
+            except (BlockingIOError, OSError) as e:
+                return
+            finally:
+                try:
+                    fcntl.flock(lock_fd, fcntl.LOCK_UN)
+                    if os.path.exists(lock_file_path):
+                        os.remove(lock_file_path)
+                except Exception:
+                    return
+
+
    def sleep(self, level: int = 1) -> None:
        free_bytes_before_sleep = torch.npu.mem_get_info()[0]
        # Save the buffers before level 2 sleep