Add Worker Interface:check_health (#6681)
This pull request introduces a new capability to monitor the health of
NPU cards directly from the Worker class. This enhancement allows for
proactive detection of NPU issues by executing the npu-smi command,
improving system reliability and operational visibility within the
vllm_ascend worker environment.
- vLLM version: v0.15.0
- vLLM main:
13397841ab
---------
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Signed-off-by: luomin2005 <luomin2005@huawei.com>
Co-authored-by: liziyu <56102866+liziyu179@users.noreply.github.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
@@ -571,3 +571,38 @@ class NPUWorker(WorkerBase):
|
||||
|
||||
def take_draft_token_ids(self) -> DraftTokenIds | None:
|
||||
return self.model_runner.take_draft_token_ids()
|
||||
|
||||
def check_health(self) -> None:
|
||||
import subprocess
|
||||
|
||||
logger.info("check_health Start!")
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["npu-smi", "info", "-i", str(self.local_rank), "-t", "health"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
parse_text_output(result.stdout)
|
||||
logger.info("check_health success!")
|
||||
else:
|
||||
logger.info(f"query NPU card {self.local_rank} fail: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.info(f"query NPU card {self.local_rank} timeout.")
|
||||
except FileNotFoundError:
|
||||
logger.info("npu-smi tool not found.")
|
||||
except Exception as e:
|
||||
logger.info(f"query NPU card {self.local_rank} fail: {e}")
|
||||
return
|
||||
|
||||
|
||||
def parse_text_output(output) -> None:
|
||||
lines = output.strip().split("\n")
|
||||
for i, line in enumerate(lines):
|
||||
line = line.strip()
|
||||
if "Health" in line:
|
||||
if line.split(":")[-1].strip() != "OK":
|
||||
raise RuntimeError("NPU card health status is not OK")
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user