From bd7cfbd2f852c1a55b83c95163526e04971ebab9 Mon Sep 17 00:00:00 2001
From: Povilas Kanapickas
Date: Fri, 13 Jun 2025 00:58:22 +0300
Subject: [PATCH] [Fix] Reduce busy polling when scheduler is idle (#6026)

---
NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Line boundaries were restored from the hunk headers and the diffstat, which
all agree. Leading indentation was NOT recoverable and has been reconstructed
from the Python block structure -- TODO confirm against the target tree, or
apply with `git apply --ignore-whitespace`. The author e-mail address is
missing from the From: header.

 docs/backend/server_arguments.md            |  2 +-
 python/sglang/srt/disaggregation/decode.py  |  2 ++
 python/sglang/srt/disaggregation/prefill.py |  2 ++
 python/sglang/srt/managers/scheduler.py     | 37 +++++++++++++++++++++
 python/sglang/srt/server_args.py            |  6 ++++
 5 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md
index 0c7462ba4..9ce110233 100644
--- a/docs/backend/server_arguments.md
+++ b/docs/backend/server_arguments.md
@@ -107,7 +107,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--download-dir` | Model download directory for huggingface. | None |
 | `--base-gpu-id` | The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine. | 0 |
 | `--gpu-id-step` | The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,.... | 1 |
-
+| `--sleep-on-idle` | Reduce CPU usage when sglang is idle. | False |
 
 ## Logging
 
diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index 12b4408fd..99b2bf330 100644
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -550,6 +550,7 @@ class SchedulerDisaggregationDecodeMixin:
                 # When the server is idle, do self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
 
@@ -628,6 +629,7 @@ class SchedulerDisaggregationDecodeMixin:
                 # When the server is idle, do self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
             self.last_batch_in_queue = last_batch_in_queue
diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
index 5a416e896..a008b404f 100644
--- a/python/sglang/srt/disaggregation/prefill.py
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -242,6 +242,7 @@ class SchedulerDisaggregationPrefillMixin:
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -294,6 +295,7 @@ class SchedulerDisaggregationPrefillMixin:
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index efef841e7..37f39096c 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -179,6 +179,27 @@ class EmbeddingBatchResult:
     bid: int
 
 
+class IdleSleeper:
+    """
+    In setups which have long inactivity periods it is desirable to reduce
+    system power consumption when sglang does nothing. This would lead not only
+    to power savings, but also to more CPU thermal headroom when a request
+    eventually comes. This is important in cases when multiple GPUs are connected
+    as each GPU would otherwise pin one thread at 100% CPU usage.
+
+    The simplest solution is to use zmq.Poller on all sockets that may receive
+    data that needs handling immediately.
+    """
+
+    def __init__(self, sockets):
+        self.poller = zmq.Poller()
+        for s in sockets:
+            self.poller.register(s, zmq.POLLIN)
+
+    def maybe_sleep(self):
+        self.poller.poll(1000)
+
+
 class Scheduler(
     SchedulerOutputProcessorMixin,
     SchedulerDisaggregationDecodeMixin,
@@ -228,6 +249,8 @@ class Scheduler(
         # Init inter-process communication
         context = zmq.Context(2)
 
+        self.idle_sleeper = None
+
         if self.pp_rank == 0 and self.attn_tp_rank == 0:
             self.recv_from_tokenizer = get_zmq_socket(
                 context, zmq.PULL, port_args.scheduler_input_ipc_name, False
@@ -250,6 +273,13 @@ class Scheduler(
             self.recv_from_rpc = get_zmq_socket(
                 context, zmq.DEALER, port_args.rpc_ipc_name, False
             )
+            if self.server_args.sleep_on_idle:
+                self.idle_sleeper = IdleSleeper(
+                    [
+                        self.recv_from_tokenizer,
+                        self.recv_from_rpc,
+                    ]
+                )
         else:
             self.recv_from_tokenizer = None
             self.recv_from_rpc = None
@@ -478,6 +508,10 @@ class Scheduler(
         )
         self.init_disaggregation()
 
+    def maybe_sleep_on_idle(self):
+        if self.idle_sleeper is not None:
+            self.idle_sleeper.maybe_sleep()
+
     def init_tokenizer(self):
         server_args = self.server_args
 
@@ -667,6 +701,7 @@ class Scheduler(
                 # When the server is idle, do self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
 
@@ -711,6 +746,7 @@ class Scheduler(
                 # When the server is idle, do self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
 
@@ -816,6 +852,7 @@ class Scheduler(
             if server_is_idle:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
     def recv_requests(self) -> List[Req]:
         """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index b4931f7f3..d0a97eb6a 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -90,6 +90,7 @@ class ServerArgs:
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
     gpu_id_step: int = 1
+    sleep_on_idle: bool = False
 
     # Logging
     log_level: str = "info"
@@ -844,6 +845,11 @@ class ServerArgs:
             default=ServerArgs.gpu_id_step,
             help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
         )
+        parser.add_argument(
+            "--sleep-on-idle",
+            action="store_true",
+            help="Reduce CPU usage when sglang is idle.",
+        )
 
         # Logging
         parser.add_argument(