[bugfix][npugraph_ex]fix static kernel uninstall issue (#6128)

### What this PR does / why we need it?

The static kernel in torch_npu is uninstalled through Python's atexit
mechanism.
However, in vllm-ascend, the worker process is terminated when inference
ends or the service stops. Terminating the process this way does not
trigger the atexit mechanism, so the static kernel is never unloaded.
When the npugraph_ex backend is used with the static kernel enabled, we
register a signal handler that explicitly unloads the static kernel.
When there are many static kernels, unloading usually takes some time,
whereas vllm kills the process directly after sending a terminate
event. Therefore, we run the unload in a separate, newly started process
instead of doing it inside the worker.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
d68209402d

---------

Signed-off-by: chencangtao <chencangtao@huawei.com>
Co-authored-by: chencangtao <chencangtao@huawei.com>
This commit is contained in:
ChenCangtao
2026-01-26 15:03:18 +08:00
committed by GitHub
parent f910cebe04
commit 1645546661

View File

@@ -131,6 +131,57 @@ class NPUWorker(WorkerBase):
self.use_v2_model_runner = envs_vllm.VLLM_USE_V2_MODEL_RUNNER
npugraph_ex_config = get_ascend_config().npugraph_ex_config
if npugraph_ex_config.enable and npugraph_ex_config.enable_static_kernel:
# Prevent duplicate triggers, execute the exit logic only once
shutdown_request = False
def signal_handler(signum, frame):
nonlocal shutdown_request
if not shutdown_request:
shutdown_request = True
self.uninstall_static_kernel()
raise SystemExit()
# Either SIGTERM or SIGINT will terminate the worker
import signal
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
def uninstall_static_kernel(self):
    """Launch the static-kernel uninstall script in a detached process.

    The script is looked up at
    ``$ASCEND_HOME_PATH/opp/static_kernel/ai_core/uninstall.sh``. An
    exclusive, non-blocking ``flock`` on ``uninstall.lock`` in the
    ``static_kernel`` directory guards the launch so that only the caller
    that wins the lock spawns the script.

    This is best-effort cleanup: it returns ``None`` silently when
    ``ASCEND_HOME_PATH`` is unset, the script is missing, the lock file
    cannot be opened, another holder already owns the lock, or the script
    cannot be spawned. The spawned process is not waited for.
    """
    import contextlib
    import fcntl
    import os
    import subprocess

    # A missing environment variable must not raise out of exit-time cleanup.
    ascend_home_path = os.environ.get("ASCEND_HOME_PATH")
    if not ascend_home_path:
        return

    static_kernel_dir_path = os.path.join(ascend_home_path, 'opp/static_kernel')
    uninstall_script_path = os.path.join(static_kernel_dir_path, 'ai_core/uninstall.sh')
    lock_file_path = os.path.join(static_kernel_dir_path, 'uninstall.lock')
    if not os.path.exists(uninstall_script_path):
        return

    try:
        lock_fd = open(lock_file_path, 'w')
    except OSError:
        # E.g. read-only install tree or insufficient permissions.
        return

    with lock_fd:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:
            # BlockingIOError (an OSError subclass) means someone else holds
            # the lock. Leave the lock file alone: it belongs to that holder.
            return

        try:
            # New session + detached stdio so the script is not tied to this
            # process's session and standard streams.
            subprocess.Popen(
                ['bash', uninstall_script_path],
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                start_new_session=True,
            )
        except OSError:
            # Spawning failed (e.g. bash not found); nothing more to do.
            pass
        finally:
            # Only the lock owner reaches this point, so removing the file
            # here cannot pull it out from under another holder. Unlink
            # before unlocking so the path disappears while we still own it.
            with contextlib.suppress(OSError):
                os.remove(lock_file_path)
            with contextlib.suppress(OSError):
                fcntl.flock(lock_fd, fcntl.LOCK_UN)
def sleep(self, level: int = 1) -> None:
free_bytes_before_sleep = torch.npu.mem_get_info()[0]
# Save the buffers before level 2 sleep