Deprecate global_server_args_dict (#11528)

This commit is contained in:
Liangsheng Yin
2025-10-13 19:34:43 +08:00
committed by GitHub
parent 0b6f535f66
commit 516738b096
54 changed files with 240 additions and 321 deletions

View File

@@ -83,10 +83,6 @@ from sglang.srt.layers.sampler import Sampler
from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
from sglang.srt.lora.lora_manager import LoRAManager
from sglang.srt.lora.lora_registry import LoRARef
from sglang.srt.managers.schedule_batch import (
GLOBAL_SERVER_ARGS_KEYS,
global_server_args_dict,
)
from sglang.srt.mem_cache.allocator import (
BaseTokenToKVPoolAllocator,
PagedTokenToKVPoolAllocator,
@@ -125,7 +121,11 @@ from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
from sglang.srt.model_loader.utils import set_default_torch_dtype
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
from sglang.srt.server_args import ServerArgs
from sglang.srt.server_args import (
ServerArgs,
get_global_server_args,
set_global_server_args_for_scheduler,
)
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.utils import (
MultiprocessingSerializer,
@@ -278,15 +278,12 @@ class ModelRunner:
# Model-specific adjustment
self.model_specific_adjustment()
# Global vars
global_server_args_dict.update(
{k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS}
| {
# TODO it is indeed not a "server args"
"use_mla_backend": self.use_mla_backend,
"speculative_algorithm": self.spec_algorithm,
}
)
# Set the global server_args in the scheduler process
set_global_server_args_for_scheduler(server_args)
global_server_args = get_global_server_args()
# FIXME: hacky set `use_mla_backend`
global_server_args.use_mla_backend = self.use_mla_backend
# Init OpenMP threads binding for CPU
if self.device == "cpu":
@@ -419,7 +416,7 @@ class ModelRunner:
# In layered loading, torchao may have been applied
if not torchao_applied:
apply_torchao_config_to_model(
self.model, global_server_args_dict["torchao_config"]
self.model, get_global_server_args().torchao_config
)
# Apply torch TP if the model supports it
@@ -1879,12 +1876,10 @@ class ModelRunner:
self.server_args.attention_backend
)
global_server_args_dict.update(
{
"decode_attention_backend": self.decode_attention_backend_str,
"prefill_attention_backend": self.prefill_attention_backend_str,
}
)
(
get_global_server_args().prefill_attention_backend,
get_global_server_args().decode_attention_backend,
) = (self.prefill_attention_backend_str, self.decode_attention_backend_str)
return attn_backend
def _get_attention_backend_from_str(self, backend_str: str):