Refactor global_server_args_dict (#6866)
This commit is contained in:
@@ -70,33 +70,38 @@ if TYPE_CHECKING:
|
||||
|
||||
INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
|
||||
|
||||
GLOBAL_SERVER_ARGS_KEYS = [
|
||||
"attention_backend",
|
||||
"debug_tensor_dump_inject",
|
||||
"debug_tensor_dump_output_folder",
|
||||
"chunked_prefill_size",
|
||||
"deepep_mode",
|
||||
"device",
|
||||
"disable_chunked_prefix_cache",
|
||||
"disable_radix_cache",
|
||||
"enable_deepep_moe",
|
||||
"enable_dp_attention",
|
||||
"enable_two_batch_overlap",
|
||||
"enable_dp_lm_head",
|
||||
"enable_ep_moe",
|
||||
"deepep_config",
|
||||
"enable_nan_detection",
|
||||
"flashinfer_mla_disable_ragged",
|
||||
"max_micro_batch_size",
|
||||
"moe_dense_tp_size",
|
||||
"ep_dispatch_algorithm",
|
||||
"disable_shared_experts_fusion",
|
||||
"sampling_backend",
|
||||
"speculative_accept_threshold_acc",
|
||||
"speculative_accept_threshold_single",
|
||||
"torchao_config",
|
||||
"triton_attention_reduce_in_fp32",
|
||||
"ep_num_redundant_experts",
|
||||
"mm_attention_backend",
|
||||
]
|
||||
|
||||
# Put some global args for easy access
|
||||
global_server_args_dict = {
|
||||
"attention_backend": ServerArgs.attention_backend,
|
||||
"chunked_prefill_size": ServerArgs.chunked_prefill_size,
|
||||
"deepep_mode": ServerArgs.deepep_mode,
|
||||
"device": ServerArgs.device,
|
||||
"disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
|
||||
"disable_radix_cache": ServerArgs.disable_radix_cache,
|
||||
"enable_deepep_moe": ServerArgs.enable_deepep_moe,
|
||||
"enable_dp_attention": ServerArgs.enable_dp_attention,
|
||||
"enable_two_batch_overlap": ServerArgs.enable_two_batch_overlap,
|
||||
"enable_dp_lm_head": ServerArgs.enable_dp_lm_head,
|
||||
"enable_ep_moe": ServerArgs.enable_ep_moe,
|
||||
"deepep_config": ServerArgs.deepep_config,
|
||||
"enable_nan_detection": ServerArgs.enable_nan_detection,
|
||||
"flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
|
||||
"max_micro_batch_size": ServerArgs.max_micro_batch_size,
|
||||
"moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
|
||||
"ep_dispatch_algorithm": ServerArgs.ep_dispatch_algorithm,
|
||||
"disable_shared_experts_fusion": ServerArgs.disable_shared_experts_fusion,
|
||||
"sampling_backend": ServerArgs.sampling_backend,
|
||||
"speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
|
||||
"speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
|
||||
"torchao_config": ServerArgs.torchao_config,
|
||||
"triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
|
||||
"ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts,
|
||||
}
|
||||
global_server_args_dict = {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -65,7 +65,10 @@ from sglang.srt.managers.expert_location import (
|
||||
get_global_expert_location_metadata,
|
||||
set_global_expert_location_metadata,
|
||||
)
|
||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||
from sglang.srt.managers.schedule_batch import (
|
||||
GLOBAL_SERVER_ARGS_KEYS,
|
||||
global_server_args_dict,
|
||||
)
|
||||
from sglang.srt.mem_cache.memory_pool import (
|
||||
DoubleSparseTokenToKVPool,
|
||||
MHATokenToKVPool,
|
||||
@@ -187,33 +190,10 @@ class ModelRunner:
|
||||
|
||||
# Global vars
|
||||
global_server_args_dict.update(
|
||||
{
|
||||
"attention_backend": server_args.attention_backend,
|
||||
"debug_tensor_dump_inject": server_args.debug_tensor_dump_inject,
|
||||
"debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder,
|
||||
"deepep_mode": server_args.deepep_mode,
|
||||
"device": server_args.device,
|
||||
"disable_chunked_prefix_cache": server_args.disable_chunked_prefix_cache,
|
||||
"disable_radix_cache": server_args.disable_radix_cache,
|
||||
"enable_nan_detection": server_args.enable_nan_detection,
|
||||
"enable_dp_attention": server_args.enable_dp_attention,
|
||||
"enable_two_batch_overlap": server_args.enable_two_batch_overlap,
|
||||
"enable_dp_lm_head": server_args.enable_dp_lm_head,
|
||||
"enable_ep_moe": server_args.enable_ep_moe,
|
||||
"enable_deepep_moe": server_args.enable_deepep_moe,
|
||||
"deepep_config": server_args.deepep_config,
|
||||
"flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
|
||||
"moe_dense_tp_size": server_args.moe_dense_tp_size,
|
||||
"ep_dispatch_algorithm": server_args.ep_dispatch_algorithm,
|
||||
"disable_shared_experts_fusion": server_args.disable_shared_experts_fusion,
|
||||
"triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
|
||||
"torchao_config": server_args.torchao_config,
|
||||
"sampling_backend": server_args.sampling_backend,
|
||||
"speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
|
||||
"speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
|
||||
{k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS}
|
||||
| {
|
||||
# TODO: "use_mla_backend" is not really a server arg (it comes from self, not server_args)
|
||||
"use_mla_backend": self.use_mla_backend,
|
||||
"mm_attention_backend": server_args.mm_attention_backend,
|
||||
"ep_num_redundant_experts": server_args.ep_num_redundant_experts,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user