From f5599ef124215b644aab51117bd23b05ccb67826 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Sat, 7 Jun 2025 18:10:35 +0800 Subject: [PATCH] Refactor global_server_args_dict (#6866) --- python/sglang/srt/managers/schedule_batch.py | 57 ++++++++++--------- .../sglang/srt/model_executor/model_runner.py | 34 +++-------- 2 files changed, 38 insertions(+), 53 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 38ec54856..74191ae5b 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -70,33 +70,38 @@ if TYPE_CHECKING: INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5 +GLOBAL_SERVER_ARGS_KEYS = [ + "attention_backend", + "debug_tensor_dump_inject", + "debug_tensor_dump_output_folder", + "chunked_prefill_size", + "deepep_mode", + "device", + "disable_chunked_prefix_cache", + "disable_radix_cache", + "enable_deepep_moe", + "enable_dp_attention", + "enable_two_batch_overlap", + "enable_dp_lm_head", + "enable_ep_moe", + "deepep_config", + "enable_nan_detection", + "flashinfer_mla_disable_ragged", + "max_micro_batch_size", + "moe_dense_tp_size", + "ep_dispatch_algorithm", + "disable_shared_experts_fusion", + "sampling_backend", + "speculative_accept_threshold_acc", + "speculative_accept_threshold_single", + "torchao_config", + "triton_attention_reduce_in_fp32", + "ep_num_redundant_experts", + "mm_attention_backend", +] + # Put some global args for easy access -global_server_args_dict = { - "attention_backend": ServerArgs.attention_backend, - "chunked_prefill_size": ServerArgs.chunked_prefill_size, - "deepep_mode": ServerArgs.deepep_mode, - "device": ServerArgs.device, - "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache, - "disable_radix_cache": ServerArgs.disable_radix_cache, - "enable_deepep_moe": ServerArgs.enable_deepep_moe, - "enable_dp_attention": ServerArgs.enable_dp_attention, - "enable_two_batch_overlap": ServerArgs.enable_two_batch_overlap, - "enable_dp_lm_head": ServerArgs.enable_dp_lm_head, - "enable_ep_moe": ServerArgs.enable_ep_moe, - "deepep_config": ServerArgs.deepep_config, - "enable_nan_detection": ServerArgs.enable_nan_detection, - "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged, - "max_micro_batch_size": ServerArgs.max_micro_batch_size, - "moe_dense_tp_size": ServerArgs.moe_dense_tp_size, - "ep_dispatch_algorithm": ServerArgs.ep_dispatch_algorithm, - "disable_shared_experts_fusion": ServerArgs.disable_shared_experts_fusion, - "sampling_backend": ServerArgs.sampling_backend, - "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc, - "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single, - "torchao_config": ServerArgs.torchao_config, - "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32, - "ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts, -} +global_server_args_dict = {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS} logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 990335200..f15e68b43 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -65,7 +65,10 @@ from sglang.srt.managers.expert_location import ( get_global_expert_location_metadata, set_global_expert_location_metadata, ) -from sglang.srt.managers.schedule_batch import global_server_args_dict +from sglang.srt.managers.schedule_batch import ( + GLOBAL_SERVER_ARGS_KEYS, + global_server_args_dict, +) from sglang.srt.mem_cache.memory_pool import ( DoubleSparseTokenToKVPool, MHATokenToKVPool, @@ -187,33 +190,10 @@ class ModelRunner: # Global vars global_server_args_dict.update( - { - "attention_backend": server_args.attention_backend, - "debug_tensor_dump_inject": server_args.debug_tensor_dump_inject, - "debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder, - "deepep_mode": server_args.deepep_mode, - "device": server_args.device, - "disable_chunked_prefix_cache": server_args.disable_chunked_prefix_cache, - "disable_radix_cache": server_args.disable_radix_cache, - "enable_nan_detection": server_args.enable_nan_detection, - "enable_dp_attention": server_args.enable_dp_attention, - "enable_two_batch_overlap": server_args.enable_two_batch_overlap, - "enable_dp_lm_head": server_args.enable_dp_lm_head, - "enable_ep_moe": server_args.enable_ep_moe, - "enable_deepep_moe": server_args.enable_deepep_moe, - "deepep_config": server_args.deepep_config, - "flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged, - "moe_dense_tp_size": server_args.moe_dense_tp_size, - "ep_dispatch_algorithm": server_args.ep_dispatch_algorithm, - "disable_shared_experts_fusion": server_args.disable_shared_experts_fusion, - "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32, - "torchao_config": server_args.torchao_config, - "sampling_backend": server_args.sampling_backend, - "speculative_accept_threshold_single": server_args.speculative_accept_threshold_single, - "speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc, + {k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS} + | { + # TODO it is indeed not a "server args" "use_mla_backend": self.use_mla_backend, - "mm_attention_backend": server_args.mm_attention_backend, - "ep_num_redundant_experts": server_args.ep_num_redundant_experts, } )