Move args from global_config to environ (#11332)

This commit is contained in:
Liangsheng Yin
2025-10-12 21:29:31 +08:00
committed by GitHub
parent 01e59e8247
commit f49419061d
6 changed files with 34 additions and 46 deletions

View File

@@ -16,13 +16,7 @@ from typing import TYPE_CHECKING, Callable, List, Optional, Union
import torch
if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
torch._logging.set_logs(dynamo=logging.ERROR)
torch._dynamo.config.suppress_errors = True
logger = logging.getLogger(__name__)
from sglang.global_config import global_config
from sglang.srt.environ import envs
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
from sglang.srt.layers.dp_attention import get_attention_tp_size
@@ -41,6 +35,12 @@ if TYPE_CHECKING:
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.model_executor.model_runner import ModelRunner
logger = logging.getLogger(__name__)
if envs.SGLANG_ENABLE_TORCH_COMPILE.get():
torch._logging.set_logs(dynamo=logging.ERROR)
torch._dynamo.config.suppress_errors = True
if is_flashinfer_available():
from flashinfer import (
@@ -160,7 +160,7 @@ class FlashInferAttnBackend(AttentionBackend):
or "Qwen3ForCausalLM" in model_runner.model_config.hf_config.architectures
or "MiMoForCausalLM" in model_runner.model_config.hf_config.architectures
):
global_config.flashinfer_workspace_size = 512 * 1024 * 1024
envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(512 * 1024 * 1024)
# When deterministic inference is enabled, tensor cores should be used for decode
# Also set split tile sizes for prefill and decode from environment variables, and disable kv split for cuda graph
@@ -180,13 +180,13 @@ class FlashInferAttnBackend(AttentionBackend):
"SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE", 2048
)
self.disable_cuda_graph_kv_split = True
global_config.flashinfer_workspace_size = 2048 * 1024 * 1024
envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(2048 * 1024 * 1024)
# Allocate buffers
global global_workspace_buffer
if global_workspace_buffer is None:
# different from flashinfer zero_init_global_workspace_buffer
global_workspace_size = global_config.flashinfer_workspace_size
global_workspace_size = envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get()
global_workspace_buffer = torch.empty(
global_workspace_size,
dtype=torch.uint8,

View File

@@ -22,7 +22,7 @@ if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
torch._logging.set_logs(dynamo=logging.ERROR)
torch._dynamo.config.suppress_errors = True
from sglang.global_config import global_config
from sglang.srt.environ import envs
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.attention.flashinfer_backend import (
create_flashinfer_kv_indices_triton,
@@ -204,7 +204,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
if global_workspace_buffer is None:
# different from flashinfer zero_init_global_workspace_buffer
global_workspace_buffer = torch.empty(
global_config.flashinfer_workspace_size,
envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get(),
dtype=torch.uint8,
device=model_runner.device,
)