Launch dp ranks in parallel (#2053)

Co-authored-by: Haotian Liu <6631389+haotian-liu@users.noreply.github.com>
This commit is contained in:
Lianmin Zheng
2024-11-16 17:13:36 -08:00
parent edad373135
commit f719d9aebc
5 changed files with 63 additions and 28 deletions

View File

@@ -794,6 +794,15 @@ def add_prometheus_middleware(app):
app.routes.append(metrics_route)
def bind_port(port: int) -> socket.socket:
    """Bind to a specific port, assuming it's available.

    Creates an IPv4 TCP socket with ``SO_REUSEADDR`` enabled, binds it to
    ``port`` on all interfaces (empty host string), and puts it in the
    listening state with a backlog of 1.

    Args:
        port: TCP port number to bind. ``0`` lets the OS pick a free port.

    Returns:
        The bound, listening socket. The caller owns it and is responsible
        for closing it.

    Raises:
        OSError: If the port cannot be bound (e.g. already in use or
            insufficient permissions).
        OverflowError: If ``port`` is outside the 0-65535 range.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Allows address reuse
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        sock.bind(("", port))
        sock.listen(1)
    except BaseException:
        # Do not leak the file descriptor when setup fails; the original
        # exception is re-raised unchanged so callers see the same error.
        sock.close()
        raise
    return sock
def get_amdgpu_memory_capacity():
try:
# Run rocm-smi and capture the output