Add Worker interface: check_health (#6681)

This pull request introduces a new capability to monitor the health of
NPU cards directly from the Worker class. This enhancement allows for
proactive detection of NPU issues by executing the npu-smi command,
improving system reliability and operational visibility within the
vllm_ascend worker environment.

- vLLM version: v0.15.0
- vLLM main: 13397841ab

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Signed-off-by: luomin2005 <luomin2005@huawei.com>
Co-authored-by: liziyu <56102866+liziyu179@users.noreply.github.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
luomin2005
2026-02-11 15:24:48 +08:00
committed by GitHub
parent 02886e2641
commit 0c1cfa2bac

View File

@@ -571,3 +571,38 @@ class NPUWorker(WorkerBase):
def take_draft_token_ids(self) -> DraftTokenIds | None:
    """Return the result of the model runner's ``take_draft_token_ids``.

    This is a thin pass-through: the call is forwarded to
    ``self.model_runner`` and its return value is handed back unchanged.
    """
    runner = self.model_runner
    return runner.take_draft_token_ids()
def check_health(self) -> None:
    """Query the health of this worker's NPU card through ``npu-smi``.

    Runs ``npu-smi info -i <local_rank> -t health`` with a 10 second timeout
    and, when the command exits with status 0, hands its stdout to
    ``parse_text_output``.

    The check is best-effort: a timeout, a missing ``npu-smi`` binary, a
    non-zero exit status, a non-OK health status, or any other ``Exception``
    is logged and swallowed rather than propagated to the caller.
    """
    # Imported locally so the module does not depend on subprocess at import
    # time; this mirrors the original placement.
    import subprocess

    logger.info("check_health Start!")
    try:
        result = subprocess.run(
            ["npu-smi", "info", "-i", str(self.local_rank), "-t", "health"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode == 0:
            # Raises RuntimeError when the reported status is not "OK".
            parse_text_output(result.stdout)
            logger.info("check_health success!")
        else:
            logger.warning("query NPU card %s fail: %s", self.local_rank, result.stderr)
    except subprocess.TimeoutExpired:
        logger.warning("query NPU card %s timeout.", self.local_rank)
    except FileNotFoundError:
        # subprocess.run raises this when the executable cannot be located.
        logger.warning("npu-smi tool not found.")
    except RuntimeError as e:
        # An unhealthy card is the one outcome this check exists to detect, so
        # it is reported at ERROR instead of blending in with INFO output.
        # NOTE(review): the status is still only logged, not re-raised, to
        # keep the existing non-raising contract -- confirm whether callers
        # should be notified of an unhealthy card.
        logger.error("query NPU card %s fail: %s", self.local_rank, e)
    except Exception as e:
        logger.warning("query NPU card %s fail: %s", self.local_rank, e)
def parse_text_output(output: str) -> None:
    """Validate the health status found in ``npu-smi`` text output.

    Scans *output* line by line. For every line that contains ``Health`` and
    a ``:`` separator, the text after the last ``:`` is taken as the status
    value and must equal ``OK`` once surrounding whitespace is removed.

    Lines that mention ``Health`` but carry no ``:`` (for example a banner or
    heading) hold no status value and are skipped instead of being treated as
    a failing status. Output without any matching line passes silently.

    Args:
        output: Text captured from the ``npu-smi`` health query.

    Raises:
        RuntimeError: If a health line reports a status other than ``OK``.
    """
    for raw_line in output.strip().split("\n"):
        line = raw_line.strip()
        if "Health" not in line:
            continue
        # rpartition keeps the text after the LAST colon, matching the
        # original split(":")[-1]; an empty separator means no colon at all.
        _, separator, status = line.rpartition(":")
        if not separator:
            continue
        if status.strip() != "OK":
            raise RuntimeError("NPU card health status is not OK")