diff --git a/python/sglang/bench_one_batch.py b/python/sglang/bench_one_batch.py index 863bc5839..9bbe9b0f1 100644 --- a/python/sglang/bench_one_batch.py +++ b/python/sglang/bench_one_batch.py @@ -47,6 +47,7 @@ import itertools import json import logging import multiprocessing +import os import time from typing import Tuple @@ -62,11 +63,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.server import _set_envs_and_config from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.utils import ( - configure_logger, - kill_child_process, - suppress_other_loggers, -) +from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers @dataclasses.dataclass @@ -468,4 +465,4 @@ if __name__ == "__main__": main(server_args, bench_args) finally: if server_args.tp_size != 1: - kill_child_process() + kill_process_tree(os.getpid(), include_parent=False) diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py index 9d6048bc1..01cc561e1 100644 --- a/python/sglang/bench_one_batch_server.py +++ b/python/sglang/bench_one_batch_server.py @@ -15,6 +15,7 @@ import dataclasses import itertools import json import multiprocessing +import os import time from typing import Tuple @@ -23,7 +24,7 @@ import requests from sglang.srt.server import launch_server from sglang.srt.server_args import ServerArgs -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree @dataclasses.dataclass @@ -69,7 +70,7 @@ def launch_server_internal(server_args): except Exception as e: raise e finally: - kill_child_process() + kill_process_tree(os.getpid(), include_parent=False) def launch_server_process(server_args: ServerArgs): @@ -175,7 +176,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): ) finally: if proc: - kill_child_process(proc.pid, include_self=True) + kill_process_tree(proc.pid) print(f"\nResults are saved to {bench_args.result_filename}") diff --git a/python/sglang/launch_server.py b/python/sglang/launch_server.py index 3e2cd4a97..b2ad1b320 100644 --- a/python/sglang/launch_server.py +++ b/python/sglang/launch_server.py @@ -4,7 +4,7 @@ import sys from sglang.srt.server import launch_server from sglang.srt.server_args import prepare_server_args -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree if __name__ == "__main__": server_args = prepare_server_args(sys.argv[1:]) @@ -12,4 +12,4 @@ if __name__ == "__main__": try: launch_server(server_args) finally: - kill_child_process() + kill_process_tree(os.getpid(), include_parent=False) diff --git a/python/sglang/srt/managers/data_parallel_controller.py b/python/sglang/srt/managers/data_parallel_controller.py index d4730e3f7..8edb79417 100644 --- a/python/sglang/srt/managers/data_parallel_controller.py +++ b/python/sglang/srt/managers/data_parallel_controller.py @@ -15,9 +15,11 @@ import logging import multiprocessing as mp +import signal import threading from enum import Enum, auto +import psutil import zmq from sglang.srt.managers.io_struct import ( @@ -26,13 +28,7 @@ from sglang.srt.managers.io_struct import ( ) from sglang.srt.managers.scheduler import run_scheduler_process from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.utils import ( - bind_port, - configure_logger, - get_zmq_socket, - kill_parent_process, - suppress_other_loggers, -) +from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -235,7 +231,7 @@ def run_data_parallel_controller_process( pipe_writer, ): configure_logger(server_args) - suppress_other_loggers() + parent_process = psutil.Process().parent() try: controller = DataParallelController(server_args, port_args) @@ -244,6 +240,6 @@ def run_data_parallel_controller_process( ) controller.event_loop() except Exception: - msg = get_exception_traceback() - logger.error(msg) - kill_parent_process() + traceback = get_exception_traceback() + logger.error(f"DataParallelController hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index 18f77424d..e74ba5026 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -15,9 +15,11 @@ import dataclasses import logging +import signal from collections import OrderedDict from typing import List, Union +import psutil import zmq from sglang.srt.hf_transformers_utils import get_tokenizer @@ -28,7 +30,7 @@ from sglang.srt.managers.io_struct import ( ) from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.utils import configure_logger, get_zmq_socket, kill_parent_process +from sglang.srt.utils import configure_logger, get_zmq_socket from sglang.utils import find_printable_text, get_exception_traceback logger = logging.getLogger(__name__) @@ -193,11 +195,12 @@ def run_detokenizer_process( port_args: PortArgs, ): configure_logger(server_args) + parent_process = psutil.Process().parent() try: manager = DetokenizerManager(server_args, port_args) manager.event_loop() except Exception: - msg = get_exception_traceback() - logger.error(msg) - kill_parent_process() + traceback = get_exception_traceback() + logger.error(f"DetokenizerManager hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 2563bb559..a327f37a2 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -15,6 +15,7 @@ import logging import os +import signal import threading import time import warnings @@ -23,6 +24,7 @@ from concurrent import futures from types import SimpleNamespace from typing import List, Optional +import psutil import torch import zmq @@ -73,7 +75,6 @@ from sglang.srt.utils import ( crash_on_warnings, get_bool_env_var, get_zmq_socket, - kill_parent_process, set_gpu_proc_affinity, set_random_seed, suppress_other_loggers, @@ -316,6 +317,7 @@ class Scheduler: self.watchdog_timeout = server_args.watchdog_timeout t = threading.Thread(target=self.watchdog_thread, daemon=True) t.start() + self.parent_process = psutil.Process().parent() # Init profiler if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "": @@ -359,7 +361,7 @@ class Scheduler: self.watchdog_last_time = time.time() time.sleep(self.watchdog_timeout / 2) - kill_parent_process() + self.parent_process.send_signal(signal.SIGQUIT) @torch.no_grad() def event_loop_normal(self): @@ -1423,6 +1425,7 @@ def run_scheduler_process( configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}") suppress_other_loggers() + parent_process = psutil.Process().parent() try: scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank) @@ -1434,6 +1437,6 @@ def run_scheduler_process( else: scheduler.event_loop_normal() except Exception: - msg = get_exception_traceback() - logger.error(msg) - kill_parent_process() + traceback = get_exception_traceback() + logger.error(f"Scheduler hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 001ecc1eb..15518e9e5 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -58,7 +58,7 @@ from sglang.srt.managers.io_struct import ( from sglang.srt.metrics.collector import TokenizerMetricsCollector from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.utils import get_zmq_socket, kill_child_process +from sglang.srt.utils import get_zmq_socket, kill_process_tree asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) @@ -532,7 +532,7 @@ class TokenizerManager: else: break - kill_child_process(include_self=True) + kill_process_tree(os.getpid(), include_parent=True) sys.exit(0) async def handle_loop(self): diff --git a/python/sglang/srt/managers/tp_worker_overlap_thread.py b/python/sglang/srt/managers/tp_worker_overlap_thread.py index 3b53759a7..a5412094c 100644 --- a/python/sglang/srt/managers/tp_worker_overlap_thread.py +++ b/python/sglang/srt/managers/tp_worker_overlap_thread.py @@ -15,16 +15,19 @@ import dataclasses import logging +import signal import threading from queue import Queue from typing import Optional +import psutil import torch from sglang.srt.managers.io_struct import UpdateWeightReqInput from sglang.srt.managers.schedule_batch import ModelWorkerBatch from sglang.srt.managers.tp_worker import TpModelWorker from sglang.srt.server_args import ServerArgs +from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -70,6 +73,7 @@ class TpModelWorkerClient: target=self.forward_thread_func, ) self.forward_thread.start() + self.parent_process = psutil.Process().parent() def get_worker_info(self): return self.worker.get_worker_info() @@ -87,8 +91,13 @@ class TpModelWorkerClient: ) def forward_thread_func(self): - with torch.cuda.stream(self.forward_stream): - self.forward_thread_func_() + try: + with torch.cuda.stream(self.forward_stream): + self.forward_thread_func_() + except Exception: + traceback = get_exception_traceback() + logger.error(f"TpModelWorkerClient hit an exception: {traceback}") + self.parent_process.send_signal(signal.SIGQUIT) @torch.no_grad() def forward_thread_func_(self): diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index a4753a134..c95893067 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -23,6 +23,8 @@ import json import logging import multiprocessing as mp import os +import signal +import sys import threading import time from http import HTTPStatus @@ -79,7 +81,7 @@ from sglang.srt.utils import ( configure_logger, delete_directory, is_port_available, - kill_child_process, + kill_process_tree, maybe_set_triton_cache_manager, prepare_model_and_tokenizer, set_prometheus_multiproc_dir, @@ -572,6 +574,15 @@ def _set_envs_and_config(server_args: ServerArgs): "at https://docs.flashinfer.ai/installation.html.", ) + # Register the signal handler. + # The child processes will send SIGQUIT to this process when any error happens + # This process then clean up the whole process tree + def sigquit_handler(signum, frame): + kill_process_tree(os.getpid()) + + signal.signal(signal.SIGQUIT, sigquit_handler) + + # Set mp start method mp.set_start_method("spawn", force=True) @@ -598,7 +609,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer): if pipe_finish_writer is not None: pipe_finish_writer.send(last_traceback) logger.error(f"Initialization failed. warmup error: {last_traceback}") - kill_child_process(include_self=True) + kill_process_tree(os.getpid()) return model_info = res.json() @@ -631,7 +642,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer): if pipe_finish_writer is not None: pipe_finish_writer.send(last_traceback) logger.error(f"Initialization failed. warmup error: {last_traceback}") - kill_child_process(include_self=True) + kill_process_tree(os.getpid()) return # logger.info(f"{res.json()=}") @@ -700,7 +711,7 @@ class Runtime: def shutdown(self): if self.pid is not None: - kill_child_process(self.pid, include_self=True) + kill_process_tree(self.pid) self.pid = None def cache_prefix(self, prefix: str): @@ -924,7 +935,7 @@ class Engine: return ret def shutdown(self): - kill_child_process() + kill_process_tree(os.getpid(), include_parent=False) def get_tokenizer(self): global tokenizer_manager diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 4a974e2e7..46b4db8e8 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -443,26 +443,14 @@ def assert_pkg_version(pkg: str, min_version: str, message: str): ) -def kill_parent_process(): - """Kill the parent process and all children of the parent process.""" - current_process = psutil.Process() - parent_process = current_process.parent() - kill_child_process( - parent_process.pid, include_self=True, skip_pid=current_process.pid - ) - try: - current_process.kill() - except psutil.NoSuchProcess: - pass - - -def kill_child_process(pid=None, include_self=False, skip_pid=None): - """Kill the process and all its children process.""" - if pid is None: - pid = os.getpid() +def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None): + """Kill the process and all its child processes.""" + if parent_pid is None: + parent_pid = os.getpid() + include_parent = False try: - itself = psutil.Process(pid) + itself = psutil.Process(parent_pid) except psutil.NoSuchProcess: return @@ -475,13 +463,13 @@ def kill_child_process(pid=None, include_self=False, skip_pid=None): except psutil.NoSuchProcess: pass - if include_self: + if include_parent: try: itself.kill() # Sometime processes cannot be killed with SIGKILL (e.g, PID=1 launched by kubernetes), # so we send an additional signal to kill them. - itself.send_signal(signal.SIGINT) + itself.send_signal(signal.SIGQUIT) except psutil.NoSuchProcess: pass diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 308966844..3f6cce23d 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -22,7 +22,7 @@ from sglang.bench_serving import run_benchmark from sglang.global_config import global_config from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint -from sglang.srt.utils import get_bool_env_var, kill_child_process +from sglang.srt.utils import get_bool_env_var, kill_process_tree from sglang.test.run_eval import run_eval from sglang.utils import get_exception_traceback @@ -504,7 +504,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float): ) assert ret_code == 0 except TimeoutError: - kill_child_process(process.pid, include_self=True) + kill_process_tree(process.pid) time.sleep(5) print( f"\nTimeout after {timeout_per_file} seconds when running {filename}\n", @@ -578,7 +578,7 @@ def run_bench_serving( run_benchmark(warmup_args) res = run_benchmark(args) finally: - kill_child_process(process.pid, include_self=True) + kill_process_tree(process.pid) assert res["completed"] == num_prompts return res @@ -611,7 +611,7 @@ def run_bench_one_batch(model, other_args): lastline = output.split("\n")[-3] output_throughput = float(lastline.split(" ")[-2]) finally: - kill_child_process(process.pid, include_self=True) + kill_process_tree(process.pid) return output_throughput @@ -710,8 +710,8 @@ def run_and_check_memory_leak( workload_func(base_url, model) # Clean up everything - kill_child_process(process.pid, include_self=True) - kill_child_process(process.pid, include_self=True) + kill_process_tree(process.pid) + kill_process_tree(process.pid) stdout.close() stderr.close() if os.path.exists(STDOUT_FILENAME): diff --git a/python/sglang/utils.py b/python/sglang/utils.py index e694dc198..c1bf62ef9 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -348,9 +348,9 @@ def wait_for_server(base_url: str, timeout: int = None) -> None: def terminate_process(process): - from sglang.srt.utils import kill_child_process + from sglang.srt.utils import kill_process_tree - kill_child_process(process.pid, include_self=True) + kill_process_tree(process.pid) def print_highlight(html_content: str): diff --git a/rust/py_test/test_launch_server.py b/rust/py_test/test_launch_server.py index 7fdaea6b1..f39b341df 100644 --- a/rust/py_test/test_launch_server.py +++ b/rust/py_test/test_launch_server.py @@ -5,7 +5,7 @@ from types import SimpleNamespace import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -79,7 +79,7 @@ class TestEvalAccuracyMini(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py index 689d52a1c..0eccb3407 100644 --- a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py +++ b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py @@ -4,7 +4,7 @@ from multiprocessing import Process import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -31,7 +31,7 @@ class TestBatchPenalizerE2E(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_decode( self, diff --git a/test/srt/test_cache_report.py b/test/srt/test_cache_report.py index 5d498ac3f..f128aa147 100644 --- a/test/srt/test_cache_report.py +++ b/test/srt/test_cache_report.py @@ -4,7 +4,7 @@ import unittest import openai import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_URL_FOR_TEST, @@ -44,7 +44,7 @@ class TestCacheReport(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): response = requests.post( diff --git a/test/srt/test_data_parallelism.py b/test/srt/test_data_parallelism.py index f34313ea0..22d000664 100644 --- a/test/srt/test_data_parallelism.py +++ b/test/srt/test_data_parallelism.py @@ -4,7 +4,7 @@ from types import SimpleNamespace import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -28,7 +28,7 @@ class TestDataParallelism(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_double_sparsity.py b/test/srt/test_double_sparsity.py index 20896aff2..060a7926f 100644 --- a/test/srt/test_double_sparsity.py +++ b/test/srt/test_double_sparsity.py @@ -2,7 +2,7 @@ import os import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -45,7 +45,7 @@ class TestDoubleSparsity(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_dp_attention.py b/test/srt/test_dp_attention.py index 32fe75a59..31c9cc71b 100644 --- a/test/srt/test_dp_attention.py +++ b/test/srt/test_dp_attention.py @@ -1,7 +1,7 @@ import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MLA_MODEL_NAME_FOR_TEST, @@ -30,7 +30,7 @@ class TestDPAttention(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_embedding_openai_server.py b/test/srt/test_embedding_openai_server.py index 666297c65..8097bf42c 100644 --- a/test/srt/test_embedding_openai_server.py +++ b/test/srt/test_embedding_openai_server.py @@ -3,7 +3,7 @@ import unittest import openai from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, @@ -28,7 +28,7 @@ class TestOpenAIServer(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_embedding(self, use_list_input, token_input): client = openai.Client(api_key=self.api_key, base_url=self.base_url) diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py index 318390d10..f7fb3cec3 100644 --- a/test/srt/test_eval_accuracy_large.py +++ b/test/srt/test_eval_accuracy_large.py @@ -6,7 +6,7 @@ python -m unittest test_eval_accuracy_large.TestEvalAccuracyLarge.test_mmlu import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -30,7 +30,7 @@ class TestEvalAccuracyLarge(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py index 2e9ff59cd..c8ce5cff2 100644 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -1,7 +1,7 @@ import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -25,7 +25,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py index 0fb08e64f..3bc115874 100644 --- a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py @@ -1,7 +1,7 @@ import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -31,7 +31,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py index a718feff7..a008c3869 100644 --- a/test/srt/test_eval_accuracy_mini.py +++ b/test/srt/test_eval_accuracy_mini.py @@ -1,7 +1,7 @@ import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -22,7 +22,7 @@ class TestEvalAccuracyMini(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_input_embeddings.py b/test/srt/test_input_embeddings.py index b57b61dad..04d54c6bb 100644 --- a/test/srt/test_input_embeddings.py +++ b/test/srt/test_input_embeddings.py @@ -4,7 +4,7 @@ import unittest import requests from transformers import AutoModelForCausalLM, AutoTokenizer -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -107,7 +107,7 @@ class TestInputEmbeds(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) if __name__ == "__main__": diff --git a/test/srt/test_json_constrained.py b/test/srt/test_json_constrained.py index ae27b036f..28acdabd9 100644 --- a/test/srt/test_json_constrained.py +++ b/test/srt/test_json_constrained.py @@ -9,7 +9,7 @@ from concurrent.futures import ThreadPoolExecutor import openai import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_URL_FOR_TEST, @@ -46,7 +46,7 @@ class TestJSONConstrainedOutlinesBackend(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_decode(self, json_schema, return_logprob=False, top_logprobs_num=0, n=1): response = requests.post( diff --git a/test/srt/test_large_max_new_tokens.py b/test/srt/test_large_max_new_tokens.py index 5ed2b06fc..dcaeef5aa 100644 --- a/test/srt/test_large_max_new_tokens.py +++ b/test/srt/test_large_max_new_tokens.py @@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor import openai from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -52,7 +52,7 @@ class TestLargeMaxNewTokens(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) cls.stdout.close() cls.stderr.close() os.remove(STDOUT_FILENAME) diff --git a/test/srt/test_matched_stop.py b/test/srt/test_matched_stop.py index 81d08b091..7b09a6d35 100644 --- a/test/srt/test_matched_stop.py +++ b/test/srt/test_matched_stop.py @@ -3,7 +3,7 @@ import unittest import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_URL_FOR_TEST, @@ -32,7 +32,7 @@ class TestMatchedStop(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_completions_generation( self, diff --git a/test/srt/test_metrics.py b/test/srt/test_metrics.py index 163a7cc0e..3b73e500d 100644 --- a/test/srt/test_metrics.py +++ b/test/srt/test_metrics.py @@ -2,7 +2,7 @@ import unittest import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -75,7 +75,7 @@ class TestEnableMetrics(unittest.TestCase): self.assertIn("_bucket{", metrics_content) finally: - kill_child_process(process.pid, include_self=True) + kill_process_tree(process.pid) if __name__ == "__main__": diff --git a/test/srt/test_mla.py b/test/srt/test_mla.py index a11be3950..b8105a84a 100644 --- a/test/srt/test_mla.py +++ b/test/srt/test_mla.py @@ -1,7 +1,7 @@ import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MLA_MODEL_NAME_FOR_TEST, @@ -25,7 +25,7 @@ class TestMLA(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_mla_fp8.py b/test/srt/test_mla_fp8.py index 5091759a9..769bdf34d 100644 --- a/test/srt/test_mla_fp8.py +++ b/test/srt/test_mla_fp8.py @@ -1,7 +1,7 @@ import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST, @@ -31,7 +31,7 @@ class TestMLA(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mgsm_en(self): args = SimpleNamespace( diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/srt/test_moe_eval_accuracy_large.py index 9880a8162..6f3affbba 100644 --- a/test/srt/test_moe_eval_accuracy_large.py +++ b/test/srt/test_moe_eval_accuracy_large.py @@ -6,7 +6,7 @@ python -m unittest test_moe_eval_accuracy_large.TestMoEEvalAccuracyLarge.test_mm import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MOE_MODEL_NAME_FOR_TEST, @@ -35,7 +35,7 @@ class TestMoEEvalAccuracyLarge(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 7c208e84b..8466c2c64 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -6,7 +6,7 @@ import warnings from datetime import datetime from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1, @@ -132,7 +132,7 @@ class TestEvalAccuracyLarge(unittest.TestCase): def tearDown(self): if self.process: - kill_child_process(self.process.pid, include_self=True) + kill_process_tree(self.process.pid) def test_mgsm_en_all_models(self): warnings.filterwarnings( diff --git a/test/srt/test_nightly_human_eval.py b/test/srt/test_nightly_human_eval.py index f69bbe132..626e6fb15 100644 --- a/test/srt/test_nightly_human_eval.py +++ b/test/srt/test_nightly_human_eval.py @@ -6,7 +6,7 @@ import unittest from test_nightly_gsm8k_eval import launch_server, parse_models -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1, DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2, @@ -32,9 +32,9 @@ class TestEvalAccuracyLarge(unittest.TestCase): @classmethod def tearDownClass(cls): if cls.process: - kill_child_process(cls.process.pid) + kill_process_tree(cls.process.pid) if cls.eval_process: - kill_child_process(cls.eval_process.pid) + kill_process_tree(cls.eval_process.pid) def run_evalplus(self, model): print("Delete evalplus results") diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 1e18e23ef..d007bed31 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -11,7 +11,7 @@ import unittest import openai from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -37,7 +37,7 @@ class TestOpenAIServer(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_completion( self, echo, logprobs, use_list_input, parallel_sample_num, token_input diff --git a/test/srt/test_pytorch_sampling_backend.py b/test/srt/test_pytorch_sampling_backend.py index 9aa6c3300..4f1403e0a 100644 --- a/test/srt/test_pytorch_sampling_backend.py +++ b/test/srt/test_pytorch_sampling_backend.py @@ -3,7 +3,7 @@ from types import SimpleNamespace import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -27,7 +27,7 @@ class TestPyTorchSamplingBackend(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_radix_attention.py b/test/srt/test_radix_attention.py index cdba7573d..207303c8c 100644 --- a/test/srt/test_radix_attention.py +++ b/test/srt/test_radix_attention.py @@ -8,7 +8,7 @@ from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, - kill_child_process, + kill_process_tree, popen_launch_server, ) @@ -80,7 +80,7 @@ class TestRadixCacheFCFS(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_radix_attention(self): nodes = gen_radix_tree() diff --git a/test/srt/test_retract_decode.py b/test/srt/test_retract_decode.py index 834c51f9d..5f169cdb6 100644 --- a/test/srt/test_retract_decode.py +++ b/test/srt/test_retract_decode.py @@ -1,7 +1,7 @@ import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -22,7 +22,7 @@ class TestRetractDecode(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_session_control.py b/test/srt/test_session_control.py index 7396779f6..8558b4249 100644 --- a/test/srt/test_session_control.py +++ b/test/srt/test_session_control.py @@ -9,7 +9,7 @@ import unittest import requests from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -29,7 +29,7 @@ class TestSessionControl(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_session_control(self): chunks = [ @@ -191,7 +191,7 @@ class TestSessionControlVision(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_session_control(self): text_chunks = [ diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index 7ec73b15d..bc99b23ad 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -7,7 +7,7 @@ import unittest import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -30,7 +30,7 @@ class TestSkipTokenizerInit(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): max_new_tokens = 32 diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index fb50943f1..006059e03 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -9,7 +9,7 @@ import unittest import numpy as np import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -29,7 +29,7 @@ class TestSRTEndpoint(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_decode( self, diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 76945f963..6f3b344b3 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -4,7 +4,7 @@ from types import SimpleNamespace import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -28,7 +28,7 @@ class TestTorchCompile(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_torch_compile_moe.py b/test/srt/test_torch_compile_moe.py index e744e6686..89d4ed6bd 100644 --- a/test/srt/test_torch_compile_moe.py +++ b/test/srt/test_torch_compile_moe.py @@ -4,7 +4,7 @@ from types import SimpleNamespace import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST, @@ -28,7 +28,7 @@ class TestTorchCompile(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_torchao.py b/test/srt/test_torchao.py index 2a2fcb8df..a6414c60b 100644 --- a/test/srt/test_torchao.py +++ b/test/srt/test_torchao.py @@ -3,7 +3,7 @@ from types import SimpleNamespace import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -27,7 +27,7 @@ class TestTorchAO(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_triton_attention_backend.py b/test/srt/test_triton_attention_backend.py index a4d19bec0..905590965 100644 --- a/test/srt/test_triton_attention_backend.py +++ b/test/srt/test_triton_attention_backend.py @@ -6,7 +6,7 @@ python3 -m unittest test_triton_attention_backend.TestTritonAttnBackend.test_mml import unittest from types import SimpleNamespace -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -54,7 +54,7 @@ class TestTritonAttnBackend(unittest.TestCase): metrics = run_eval(args) self.assertGreaterEqual(metrics["score"], 0.65) finally: - kill_child_process(process.pid, include_self=True) + kill_process_tree(process.pid) if __name__ == "__main__": diff --git a/test/srt/test_update_weights.py b/test/srt/test_update_weights.py index 327da729a..ddb5a5e08 100644 --- a/test/srt/test_update_weights.py +++ b/test/srt/test_update_weights.py @@ -3,7 +3,7 @@ import unittest import requests -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -23,7 +23,7 @@ class TestUpdateWeights(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def run_decode(self): response = requests.post( diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 95a1624cf..e19e6b01d 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -17,7 +17,7 @@ import requests from decord import VideoReader, cpu from PIL import Image -from sglang.srt.utils import kill_child_process +from sglang.srt.utils import kill_process_tree from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, @@ -46,7 +46,7 @@ class TestOpenAIVisionServer(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_chat_completion(self): client = openai.Client(api_key=self.api_key, base_url=self.base_url) @@ -387,7 +387,7 @@ class TestQWen2VLServerContextLengthIssue(unittest.TestCase): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid, include_self=True) + kill_process_tree(cls.process.pid) def test_chat_completion(self): client = openai.Client(api_key=self.api_key, base_url=self.base_url)