Improve dp attention port assignment scheme (#5889)
Co-authored-by: Cheng Wan <cwan@x.ai>
This commit is contained in:
@@ -1291,8 +1291,46 @@ def pytorch_profile(name, func, *args, data_size=-1):
|
||||
|
||||
|
||||
def get_zmq_socket(
|
||||
context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
|
||||
) -> zmq.Socket:
|
||||
context: zmq.Context,
|
||||
socket_type: zmq.SocketType,
|
||||
endpoint: Optional[str] = None,
|
||||
bind: bool = True,
|
||||
) -> Union[zmq.Socket, Tuple[int, zmq.Socket]]:
|
||||
"""Create and configure a ZeroMQ socket.
|
||||
|
||||
Args:
|
||||
context: ZeroMQ context to create the socket from.
|
||||
socket_type: Type of ZeroMQ socket to create.
|
||||
endpoint: Optional endpoint to bind/connect to. If None, binds to a random TCP port.
|
||||
bind: Whether to bind (True) or connect (False) to the endpoint. Ignored if endpoint is None.
|
||||
|
||||
Returns:
|
||||
If endpoint is None: Tuple of (port, socket) where port is the randomly assigned TCP port.
|
||||
If endpoint is provided: The configured ZeroMQ socket.
|
||||
"""
|
||||
socket = context.socket(socket_type)
|
||||
|
||||
if endpoint is None:
|
||||
# Bind to random TCP port
|
||||
config_socket(socket, socket_type)
|
||||
port = socket.bind_to_random_port("tcp://*")
|
||||
return port, socket
|
||||
else:
|
||||
# Handle IPv6 if endpoint contains brackets
|
||||
if endpoint.find("[") != -1:
|
||||
socket.setsockopt(zmq.IPV6, 1)
|
||||
|
||||
config_socket(socket, socket_type)
|
||||
|
||||
if bind:
|
||||
socket.bind(endpoint)
|
||||
else:
|
||||
socket.connect(endpoint)
|
||||
|
||||
return socket
|
||||
|
||||
|
||||
def config_socket(socket, socket_type: zmq.SocketType):
|
||||
mem = psutil.virtual_memory()
|
||||
total_mem = mem.total / 1024**3
|
||||
available_mem = mem.available / 1024**3
|
||||
@@ -1301,10 +1339,6 @@ def get_zmq_socket(
|
||||
else:
|
||||
buf_size = -1
|
||||
|
||||
socket = context.socket(socket_type)
|
||||
if endpoint.find("[") != -1:
|
||||
socket.setsockopt(zmq.IPV6, 1)
|
||||
|
||||
def set_send_opt():
|
||||
socket.setsockopt(zmq.SNDHWM, 0)
|
||||
socket.setsockopt(zmq.SNDBUF, buf_size)
|
||||
@@ -1317,19 +1351,12 @@ def get_zmq_socket(
|
||||
set_send_opt()
|
||||
elif socket_type == zmq.PULL:
|
||||
set_recv_opt()
|
||||
elif socket_type == zmq.DEALER:
|
||||
elif socket_type in [zmq.DEALER, zmq.REQ, zmq.REP]:
|
||||
set_send_opt()
|
||||
set_recv_opt()
|
||||
else:
|
||||
raise ValueError(f"Unsupported socket type: {socket_type}")
|
||||
|
||||
if bind:
|
||||
socket.bind(endpoint)
|
||||
else:
|
||||
socket.connect(endpoint)
|
||||
|
||||
return socket
|
||||
|
||||
|
||||
def dump_to_file(dirpath, name, value):
|
||||
from sglang.srt.distributed import get_tensor_model_parallel_rank
|
||||
|
||||
Reference in New Issue
Block a user