From f49419061ddf6bd7d5af204ded781cb4e5351d23 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sun, 12 Oct 2025 21:29:31 +0800 Subject: [PATCH] Move args from `global_config` to `environ` (#11332) --- python/sglang/global_config.py | 23 +------------------ python/sglang/srt/environ.py | 9 ++++++++ .../layers/attention/flashinfer_backend.py | 20 ++++++++-------- .../attention/flashinfer_mla_backend.py | 4 ++-- python/sglang/srt/managers/schedule_batch.py | 8 +++---- python/sglang/srt/managers/scheduler.py | 16 ++++++------- 6 files changed, 34 insertions(+), 46 deletions(-) diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py index 383bafe0d..552351466 100644 --- a/python/sglang/global_config.py +++ b/python/sglang/global_config.py @@ -1,6 +1,6 @@ """Global configurations""" -import os +# FIXME: deprecate this file and move all usage to sglang.srt.environ or sglang.__init__.py class GlobalConfig: @@ -20,27 +20,6 @@ class GlobalConfig: # Default backend of the language self.default_backend = None - # Runtime constants: New generation token ratio estimation - self.default_init_new_token_ratio = float( - os.environ.get("SGLANG_INIT_NEW_TOKEN_RATIO", 0.7) - ) - self.default_min_new_token_ratio_factor = float( - os.environ.get("SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR", 0.14) - ) - self.default_new_token_ratio_decay_steps = float( - os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600) - ) - self.torch_empty_cache_interval = float( - os.environ.get( - "SGLANG_EMPTY_CACHE_INTERVAL", -1 - ) # in seconds. Set if you observe high memory accumulation over a long serving period. - ) - # Runtime constants: others - self.retract_decode_steps = 20 - self.flashinfer_workspace_size = int( - os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024) - ) - # Output tokenization configs self.skip_special_tokens_in_output = True self.spaces_between_special_tokens_in_out = True diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index aa9814df9..f3bb8c005 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -128,6 +128,14 @@ class Envs: SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial") SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp") + # Scheduler: new token ratio hyperparameters + SGLANG_INIT_NEW_TOKEN_RATIO = EnvFloat(0.7) + SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR = EnvFloat(0.14) + SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS = EnvInt(600) + SGLANG_RETRACT_DECODE_STEPS = EnvInt(20) + + # Scheduler: others: + SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1) # in seconds. Set if you observe high memory accumulation over a long serving period. # Test: pd-disaggregation SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake") SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None) @@ -159,6 +167,7 @@ class Envs: # Flashinfer SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True) SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False) + SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024) # Triton SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False) diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index 3d551efee..7cae8e59d 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -16,13 +16,7 @@ from typing import TYPE_CHECKING, Callable, List, Optional, Union import torch -if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1": - torch._logging.set_logs(dynamo=logging.ERROR) - torch._dynamo.config.suppress_errors = True - -logger = logging.getLogger(__name__) - -from sglang.global_config import global_config +from sglang.srt.environ import envs from sglang.srt.layers.attention.base_attn_backend import AttentionBackend from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton from sglang.srt.layers.dp_attention import get_attention_tp_size @@ -41,6 +35,12 @@ if TYPE_CHECKING: from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.model_runner import ModelRunner +logger = logging.getLogger(__name__) + +if envs.SGLANG_ENABLE_TORCH_COMPILE.get(): + torch._logging.set_logs(dynamo=logging.ERROR) + torch._dynamo.config.suppress_errors = True + if is_flashinfer_available(): from flashinfer import ( @@ -160,7 +160,7 @@ class FlashInferAttnBackend(AttentionBackend): or "Qwen3ForCausalLM" in model_runner.model_config.hf_config.architectures or "MiMoForCausalLM" in model_runner.model_config.hf_config.architectures ): - global_config.flashinfer_workspace_size = 512 * 1024 * 1024 + envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(512 * 1024 * 1024) # When deterministic inference is enabled, tensor cores should be used for decode # Also set split tile sizes for prefill and decode from environment variables, and disable kv split for cuda graph @@ -180,13 +180,13 @@ class FlashInferAttnBackend(AttentionBackend): "SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE", 2048 ) self.disable_cuda_graph_kv_split = True - global_config.flashinfer_workspace_size = 2048 * 1024 * 1024 + envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(2048 * 1024 * 1024) # Allocate buffers global global_workspace_buffer if global_workspace_buffer is None: # different from flashinfer zero_init_global_workspace_buffer - global_workspace_size = global_config.flashinfer_workspace_size + global_workspace_size = envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get() global_workspace_buffer = torch.empty( global_workspace_size, dtype=torch.uint8, diff --git a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py index 21e6772bf..6efda7775 100644 --- a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py @@ -22,7 +22,7 @@ if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1": torch._logging.set_logs(dynamo=logging.ERROR) torch._dynamo.config.suppress_errors = True -from sglang.global_config import global_config +from sglang.srt.environ import envs from sglang.srt.layers.attention.base_attn_backend import AttentionBackend from sglang.srt.layers.attention.flashinfer_backend import ( create_flashinfer_kv_indices_triton, @@ -204,7 +204,7 @@ class FlashInferMLAAttnBackend(AttentionBackend): if global_workspace_buffer is None: # different from flashinfer zero_init_global_workspace_buffer global_workspace_buffer = torch.empty( - global_config.flashinfer_workspace_size, + envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get(), dtype=torch.uint8, device=model_runner.device, ) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index fec93f66f..f0b03638f 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -37,7 +37,6 @@ import copy import dataclasses import logging import re -import threading import time from enum import Enum, auto from http import HTTPStatus @@ -47,7 +46,6 @@ from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union import numpy as np import torch -from sglang.global_config import global_config from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject from sglang.srt.disaggregation.base import BaseKVSender from sglang.srt.disaggregation.decode_schedule_batch_mixin import ( @@ -55,6 +53,7 @@ from sglang.srt.disaggregation.decode_schedule_batch_mixin import ( ) from sglang.srt.disaggregation.utils import DisaggregationMode from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank +from sglang.srt.environ import envs from sglang.srt.mem_cache.allocator import ( BaseTokenToKVPoolAllocator, SWATokenToKVPoolAllocator, @@ -1481,7 +1480,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): total_max_new_tokens = sum(r.sampling_params.max_new_tokens for r in self.reqs) new_estimate_ratio = ( - total_decoded_tokens + global_config.retract_decode_steps * len(self.reqs) + total_decoded_tokens + + envs.SGLANG_RETRACT_DECODE_STEPS.get() * len(self.reqs) ) / total_max_new_tokens new_estimate_ratio = min(1.0, new_estimate_ratio) @@ -1520,7 +1520,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): self.tree_cache.dec_lock_ref(req.last_node) # NOTE(lsyin): we should use the newly evictable memory instantly. - num_tokens = remaing_req_count * global_config.retract_decode_steps + num_tokens = remaing_req_count * envs.SGLANG_RETRACT_DECODE_STEPS.get() self._evict_tree_cache_if_needed(num_tokens) req.reset_for_retract() diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 5a7555a86..ea7b8222b 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -35,7 +35,6 @@ from torch.cuda import Stream as CudaStream from torch.cuda import StreamContext as CudaStreamContext from torch.distributed import barrier -from sglang.global_config import global_config from sglang.srt.configs.model_config import ModelConfig from sglang.srt.constrained.base_grammar_backend import ( INVALID_GRAMMAR_OBJ, @@ -61,6 +60,7 @@ from sglang.srt.disaggregation.utils import ( prepare_abort, ) from sglang.srt.distributed import get_pp_group, get_world_group +from sglang.srt.environ import envs from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder from sglang.srt.layers.dp_attention import compute_dp_attention_world_info from sglang.srt.layers.logits_processor import LogitsProcessorOutput @@ -556,18 +556,17 @@ class Scheduler( server_args.schedule_conservativeness >= 0 ), "Invalid schedule_conservativeness" self.init_new_token_ratio = min( - global_config.default_init_new_token_ratio + envs.SGLANG_INIT_NEW_TOKEN_RATIO.get() * server_args.schedule_conservativeness, 1.0, ) self.min_new_token_ratio = min( - self.init_new_token_ratio - * global_config.default_min_new_token_ratio_factor, + self.init_new_token_ratio * envs.SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR.get(), 1.0, ) self.new_token_ratio_decay = ( self.init_new_token_ratio - self.min_new_token_ratio - ) / global_config.default_new_token_ratio_decay_steps + ) / envs.SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS.get() self.new_token_ratio = self.init_new_token_ratio # Init watchdog thread @@ -2897,12 +2896,13 @@ class IdleSleeper: for s in sockets: self.poller.register(s, zmq.POLLIN) + self.empty_cache_interval = envs.SGLANG_EMPTY_CACHE_INTERVAL.get() + def maybe_sleep(self): self.poller.poll(1000) if ( - global_config.torch_empty_cache_interval > 0 - and time.time() - self.last_empty_time - > global_config.torch_empty_cache_interval + self.empty_cache_interval > 0 + and time.time() - self.last_empty_time > self.empty_cache_interval ): self.last_empty_time = time.time() torch.cuda.empty_cache()