[Fix] Orphan process in data parallel (#7995)

Signed-off-by: Capronir <839972205@qq.com>
This commit is contained in:
@@ -13,6 +13,7 @@
|
|||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
"""A controller that dispatches requests to multiple data parallel workers."""
|
"""A controller that dispatches requests to multiple data parallel workers."""
|
||||||
|
|
||||||
|
import faulthandler
|
||||||
import logging
|
import logging
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
import signal
|
import signal
|
||||||
@@ -39,7 +40,12 @@ from sglang.srt.managers.scheduler import run_scheduler_process
|
|||||||
from sglang.srt.managers.utils import DPBalanceMeta
|
from sglang.srt.managers.utils import DPBalanceMeta
|
||||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||||
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
|
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
|
||||||
from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
|
from sglang.srt.utils import (
|
||||||
|
bind_port,
|
||||||
|
configure_logger,
|
||||||
|
get_zmq_socket,
|
||||||
|
kill_itself_when_parent_died,
|
||||||
|
)
|
||||||
from sglang.utils import get_exception_traceback
|
from sglang.utils import get_exception_traceback
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -343,7 +349,9 @@ def run_data_parallel_controller_process(
|
|||||||
port_args: PortArgs,
|
port_args: PortArgs,
|
||||||
pipe_writer,
|
pipe_writer,
|
||||||
):
|
):
|
||||||
|
kill_itself_when_parent_died()
|
||||||
setproctitle.setproctitle("sglang::data_parallel_controller")
|
setproctitle.setproctitle("sglang::data_parallel_controller")
|
||||||
|
faulthandler.enable()
|
||||||
configure_logger(server_args)
|
configure_logger(server_args)
|
||||||
parent_process = psutil.Process().parent()
|
parent_process = psutil.Process().parent()
|
||||||
balance_meta = DPBalanceMeta(server_args.dp_size)
|
balance_meta = DPBalanceMeta(server_args.dp_size)
|
||||||
|
|||||||
Reference in New Issue
Block a user