From 96784a65fd8b312502adb13fdcb18ccbcc8cce4d Mon Sep 17 00:00:00 2001 From: Caproni <40862361+Capronir@users.noreply.github.com> Date: Tue, 9 Sep 2025 11:09:09 +0800 Subject: [PATCH] [Fix] Orphan process in data parallel (#7995) Signed-off-by: Capronir <839972205@qq.com> --- python/sglang/srt/managers/data_parallel_controller.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/data_parallel_controller.py b/python/sglang/srt/managers/data_parallel_controller.py index 76b9e1a01..677712a57 100644 --- a/python/sglang/srt/managers/data_parallel_controller.py +++ b/python/sglang/srt/managers/data_parallel_controller.py @@ -13,6 +13,7 @@ # ============================================================================== """A controller that dispatches requests to multiple data parallel workers.""" +import faulthandler import logging import multiprocessing as mp import signal @@ -39,7 +40,12 @@ from sglang.srt.managers.scheduler import run_scheduler_process from sglang.srt.managers.utils import DPBalanceMeta from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter -from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket +from sglang.srt.utils import ( + bind_port, + configure_logger, + get_zmq_socket, + kill_itself_when_parent_died, +) from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -343,7 +349,9 @@ def run_data_parallel_controller_process( port_args: PortArgs, pipe_writer, ): + kill_itself_when_parent_died() setproctitle.setproctitle("sglang::data_parallel_controller") + faulthandler.enable() configure_logger(server_args) parent_process = psutil.Process().parent() balance_meta = DPBalanceMeta(server_args.dp_size)