[Feature] Simple Improve Health Check Mechanism for Production-Grade Stability (#8115)

Signed-off-by: ybyang <ybyang7@iflytek.com>
This commit is contained in:
ybyang
2025-07-20 09:10:00 +08:00
committed by GitHub
parent abda2542d5
commit 4540a4666a
6 changed files with 82 additions and 11 deletions

View File

@@ -93,6 +93,22 @@ time_infos = {}
# NOTE(review): the name suggests this is the largest magnitude used for the
# FP8 E4M3 FNUZ format on HIP devices — confirm where 224.0 comes from.
HIP_FP8_E4M3_FNUZ_MAX = 224.0
class ServerStatus(Enum):
    """Closed set of health states a server can be in.

    Each member's value is its own name as a string, so ``status.value`` can
    be sent directly in a health payload.
    """

    # Only this state counts as healthy (see ``is_healthy``).
    Up = "Up"
    Starting = "Starting"
    UnHealthy = "UnHealthy"
    Crashed = "Crashed"

    def is_healthy(self) -> bool:
        """Return True only when this status is ``ServerStatus.Up``."""
        # Enum members are singletons, so identity is the idiomatic check.
        return self is ServerStatus.Up
def report_health(
    status: ServerStatus,
    host: str,
    http_port: int,
    msg: str = "",
    timeout: float = 10.0,
):
    """POST a health status to ``http://{host}:{http_port}/health``.

    The request body is JSON of the form ``{"status": <status.value>, "msg": <msg>}``.
    The HTTP response is not inspected.

    Args:
        status: Status to report; its string ``value`` is sent as ``"status"``.
        host: Host name or address of the receiving HTTP endpoint.
        http_port: Port of the receiving HTTP endpoint.
        msg: Optional free-form message sent as ``"msg"``.
        timeout: Seconds to wait for the endpoint before giving up. Without a
            timeout an unresponsive endpoint would block the caller forever.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, including when ``timeout`` elapses.
    """
    requests.post(
        f"http://{host}:{http_port}/health",
        json={"status": status.value, "msg": msg},
        timeout=timeout,
    )
# https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
def is_hip() -> bool:
    """Return True when the installed PyTorch build reports a HIP version.

    The check is simply whether ``torch.version.hip`` is not ``None``.
    """
    return torch.version.hip is not None