[bugfix][npugraph_ex]fix static kernel uninstall issue (#6128)
### What this PR does / why we need it?
The static kernel in torch_npu is uninstalled through Python's atexit
mechanism.
However, in vllm-ascend, when inference ends or the service stops, the
worker process is terminated. This way, ending the process does not
trigger the atexit mechanism, causing the static kernel not to be
unloaded.
When using the nougraph_ex backend and enabling the static kernel, we
registered a signal handler to explicitly unload the static kernel.
When there are many static kernels, unloading usually takes some time,
whereas vllm will directly kill the process after sending a terminate
event. Therefore, we choose to handle it by starting a new process.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
d68209402d
---------
Signed-off-by: chencangtao <chencangtao@huawei.com>
Co-authored-by: chencangtao <chencangtao@huawei.com>
This commit is contained in:
@@ -131,6 +131,57 @@ class NPUWorker(WorkerBase):
|
||||
|
||||
self.use_v2_model_runner = envs_vllm.VLLM_USE_V2_MODEL_RUNNER
|
||||
|
||||
npugraph_ex_config = get_ascend_config().npugraph_ex_config
|
||||
if npugraph_ex_config.enable and npugraph_ex_config.enable_static_kernel:
|
||||
# Prevent duplicate triggers, execute the exit logic only once
|
||||
shutdown_request = False
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
nonlocal shutdown_request
|
||||
if not shutdown_request:
|
||||
shutdown_request = True
|
||||
self.uninstall_static_kernel()
|
||||
raise SystemExit()
|
||||
|
||||
# Either SIGTERM or SIGINT will terminate the worker
|
||||
import signal
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
|
||||
def uninstall_static_kernel(self):
|
||||
import os
|
||||
import fcntl
|
||||
import subprocess
|
||||
|
||||
ascend_home_path = os.environ["ASCEND_HOME_PATH"]
|
||||
static_kernel_dir_path = os.path.join(ascend_home_path, 'opp/static_kernel')
|
||||
uninstall_script_path = os.path.join(static_kernel_dir_path, 'ai_core/uninstall.sh')
|
||||
lock_file_path = os.path.join(static_kernel_dir_path, 'uninstall.lock')
|
||||
|
||||
if not os.path.exists(uninstall_script_path):
|
||||
return
|
||||
with open(lock_file_path, 'w') as lock_fd:
|
||||
try:
|
||||
fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
subprocess.Popen(
|
||||
['bash', uninstall_script_path],
|
||||
stdin=subprocess.DEVNULL,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
start_new_session=True
|
||||
)
|
||||
except (BlockingIOError, OSError) as e:
|
||||
return
|
||||
finally:
|
||||
try:
|
||||
fcntl.flock(lock_fd, fcntl.LOCK_UN)
|
||||
if os.path.exists(lock_file_path):
|
||||
os.remove(lock_file_path)
|
||||
except Exception:
|
||||
return
|
||||
|
||||
|
||||
def sleep(self, level: int = 1) -> None:
|
||||
free_bytes_before_sleep = torch.npu.mem_get_info()[0]
|
||||
# Save the buffers before level 2 sleep
|
||||
|
||||
Reference in New Issue
Block a user