From 0c1cfa2baca623ae8822d934c5941aac4efd57c4 Mon Sep 17 00:00:00 2001
From: luomin2005
Date: Wed, 11 Feb 2026 15:24:48 +0800
Subject: [PATCH] Add Worker Interface:check_health (#6681)

This pull request introduces a new capability to monitor the health of
NPU cards directly from the Worker class. This enhancement allows for
proactive detection of NPU issues by executing the npu-smi command,
improving system reliability and operational visibility within the
vllm_ascend worker environment.

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/13397841ab469cecf1ed425c3f52a9ffc38139b5

---------

Signed-off-by: liziyu
Signed-off-by: wangxiaoteng
Signed-off-by: luomin2005
Co-authored-by: liziyu <56102866+liziyu179@users.noreply.github.com>
Co-authored-by: wangxiaoteng
---
 vllm_ascend/worker/worker.py | 62 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index f92d0816..c058e59b 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -571,3 +571,65 @@ class NPUWorker(WorkerBase):
 
     def take_draft_token_ids(self) -> DraftTokenIds | None:
         return self.model_runner.take_draft_token_ids()
+
+    def check_health(self) -> None:
+        """Probe this worker's NPU card with ``npu-smi`` and log the result.
+
+        Runs ``npu-smi info -i <local_rank> -t health`` with a 10 second
+        timeout. This is a best-effort check: a missing tool, a timeout, a
+        non-zero exit status and a non-OK health report are all logged
+        instead of being propagated to the caller.
+        """
+        import subprocess
+
+        logger.info("check_health Start!")
+        # Keep the try body down to the external command so that a failure
+        # to query the card is not confused with an unhealthy report.
+        try:
+            result = subprocess.run(
+                ["npu-smi", "info", "-i", str(self.local_rank), "-t", "health"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+        except subprocess.TimeoutExpired:
+            logger.warning("query NPU card %s timeout.", self.local_rank)
+            return
+        except FileNotFoundError:
+            logger.warning("npu-smi tool not found.")
+            return
+        except Exception as e:
+            # Boundary handler: the probe itself must not take the worker down.
+            logger.warning("query NPU card %s fail: %s", self.local_rank, e)
+            return
+
+        if result.returncode != 0:
+            logger.warning("query NPU card %s fail: %s", self.local_rank, result.stderr)
+            return
+
+        try:
+            parse_text_output(result.stdout)
+        except RuntimeError as e:
+            # NOTE(review): an unhealthy card is only logged, as before. If
+            # callers should see it as a failed health check, re-raise here.
+            logger.error("query NPU card %s fail: %s", self.local_rank, e)
+            return
+        logger.info("check_health success!")
+
+
+def parse_text_output(output) -> None:
+    """Validate the text printed by ``npu-smi info -t health``.
+
+    Every line containing ``Health`` is inspected; the whitespace-stripped
+    text after its last ``:`` must be ``OK``. Output without such a line is
+    accepted.
+
+    Raises:
+        RuntimeError: if a ``Health`` line reports anything other than ``OK``.
+    """
+    for raw_line in output.splitlines():
+        line = raw_line.strip()
+        if "Health" not in line:
+            continue
+        status = line.rsplit(":", 1)[-1].strip()
+        if status != "OK":
+            raise RuntimeError(f"NPU card health status is not OK: {status}")