Deprecate global_server_args_dict (#11331)

2025-10-13 01:20:47 +08:00
parent 2157d12ae8
commit 1083e7e3df
54 changed files with 240 additions and 321 deletions
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -72,7 +72,7 @@ from sglang.srt.metrics.collector import SchedulerMetricsCollector, TimeStats
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.server_args import ServerArgs, get_global_server_args
 from sglang.srt.utils import flatten_nested_list
 from sglang.srt.utils.common import next_power_of_2

@@ -82,47 +82,6 @@ if TYPE_CHECKING:

 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5

-GLOBAL_SERVER_ARGS_KEYS = [
-    "attention_backend",
-    "mm_attention_backend",
-    "debug_tensor_dump_inject",
-    "debug_tensor_dump_output_folder",
-    "chunked_prefill_size",
-    "device",
-    "disable_chunked_prefix_cache",
-    "disable_flashinfer_cutlass_moe_fp4_allgather",
-    "disable_radix_cache",
-    "enable_dp_lm_head",
-    "enable_fp32_lm_head",
-    "flashinfer_mxfp4_moe_precision",
-    "enable_flashinfer_allreduce_fusion",
-    "moe_dense_tp_size",
-    "ep_dispatch_algorithm",
-    "ep_num_redundant_experts",
-    "enable_nan_detection",
-    "flashinfer_mla_disable_ragged",
-    "pp_max_micro_batch_size",
-    "disable_shared_experts_fusion",
-    "sampling_backend",
-    "speculative_accept_threshold_single",
-    "speculative_accept_threshold_acc",
-    "speculative_attention_mode",
-    "torchao_config",
-    "triton_attention_reduce_in_fp32",
-    "num_reserved_decode_tokens",
-    "weight_loader_disable_mmap",
-    "enable_multimodal",
-    "enable_symm_mem",
-    "enable_custom_logit_processor",
-    "disaggregation_mode",
-    "enable_deterministic_inference",
-    "nsa_prefill",
-    "nsa_decode",
-    "multi_item_scoring_delimiter",
-]
-
-# Put some global args for easy access
-global_server_args_dict = {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}

 logger = logging.getLogger(__name__)

@@ -683,12 +642,9 @@ class Req:
    def is_prefill_only(self) -> bool:
        """Check if this request is prefill-only (no token generation needed)."""
        # NOTE: when spec is enabled, prefill_only optimizations are disabled
-        from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

-        spec_alg = global_server_args_dict["speculative_algorithm"]
-        return self.sampling_params.max_new_tokens == 0 and (
-            spec_alg is None or spec_alg == SpeculativeAlgorithm.NONE
-        )
+        spec_alg = get_global_server_args().speculative_algorithm
+        return self.sampling_params.max_new_tokens == 0 and spec_alg is None

    def add_latency(self, stage: RequestStage):
        if self.metrics_collector is None: