Improve dp attention port assignment scheme (#5889)

Co-authored-by: Cheng Wan <cwan@x.ai>
This commit is contained in:
Yongtong Wu
2025-10-13 08:55:59 +08:00
committed by GitHub
parent 1bdd010291
commit a20e7df8d0
4 changed files with 175 additions and 44 deletions

View File

@@ -1291,8 +1291,46 @@ def pytorch_profile(name, func, *args, data_size=-1):
def get_zmq_socket(
context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
) -> zmq.Socket:
context: zmq.Context,
socket_type: zmq.SocketType,
endpoint: Optional[str] = None,
bind: bool = True,
) -> Union[zmq.Socket, Tuple[int, zmq.Socket]]:
"""Create and configure a ZeroMQ socket.
Args:
context: ZeroMQ context to create the socket from.
socket_type: Type of ZeroMQ socket to create.
endpoint: Optional endpoint to bind/connect to. If None, binds to a random TCP port.
bind: Whether to bind (True) or connect (False) to the endpoint. Ignored if endpoint is None.
Returns:
If endpoint is None: Tuple of (port, socket) where port is the randomly assigned TCP port.
If endpoint is provided: The configured ZeroMQ socket.
"""
socket = context.socket(socket_type)
if endpoint is None:
# Bind to random TCP port
config_socket(socket, socket_type)
port = socket.bind_to_random_port("tcp://*")
return port, socket
else:
# Handle IPv6 if endpoint contains brackets
if endpoint.find("[") != -1:
socket.setsockopt(zmq.IPV6, 1)
config_socket(socket, socket_type)
if bind:
socket.bind(endpoint)
else:
socket.connect(endpoint)
return socket
def config_socket(socket, socket_type: zmq.SocketType):
mem = psutil.virtual_memory()
total_mem = mem.total / 1024**3
available_mem = mem.available / 1024**3
@@ -1301,10 +1339,6 @@ def get_zmq_socket(
else:
buf_size = -1
socket = context.socket(socket_type)
if endpoint.find("[") != -1:
socket.setsockopt(zmq.IPV6, 1)
def set_send_opt():
socket.setsockopt(zmq.SNDHWM, 0)
socket.setsockopt(zmq.SNDBUF, buf_size)
@@ -1317,19 +1351,12 @@ def get_zmq_socket(
set_send_opt()
elif socket_type == zmq.PULL:
set_recv_opt()
elif socket_type == zmq.DEALER:
elif socket_type in [zmq.DEALER, zmq.REQ, zmq.REP]:
set_send_opt()
set_recv_opt()
else:
raise ValueError(f"Unsupported socket type: {socket_type}")
if bind:
socket.bind(endpoint)
else:
socket.connect(endpoint)
return socket
def dump_to_file(dirpath, name, value):
from sglang.srt.distributed import get_tensor_model_parallel_rank