Move args from global_config to environ (#11332)
This commit is contained in:
@@ -16,13 +16,7 @@ from typing import TYPE_CHECKING, Callable, List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
|
||||
torch._logging.set_logs(dynamo=logging.ERROR)
|
||||
torch._dynamo.config.suppress_errors = True
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from sglang.global_config import global_config
|
||||
from sglang.srt.environ import envs
|
||||
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
||||
from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
|
||||
from sglang.srt.layers.dp_attention import get_attention_tp_size
|
||||
@@ -41,6 +35,12 @@ if TYPE_CHECKING:
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if envs.SGLANG_ENABLE_TORCH_COMPILE.get():
|
||||
torch._logging.set_logs(dynamo=logging.ERROR)
|
||||
torch._dynamo.config.suppress_errors = True
|
||||
|
||||
|
||||
if is_flashinfer_available():
|
||||
from flashinfer import (
|
||||
@@ -160,7 +160,7 @@ class FlashInferAttnBackend(AttentionBackend):
|
||||
or "Qwen3ForCausalLM" in model_runner.model_config.hf_config.architectures
|
||||
or "MiMoForCausalLM" in model_runner.model_config.hf_config.architectures
|
||||
):
|
||||
global_config.flashinfer_workspace_size = 512 * 1024 * 1024
|
||||
envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(512 * 1024 * 1024)
|
||||
|
||||
# When deterministic inference is enabled, tensor cores should be used for decode
|
||||
# Also set the split tile sizes for prefill and decode from environment variables, and disable KV split for CUDA graphs
|
||||
@@ -180,13 +180,13 @@ class FlashInferAttnBackend(AttentionBackend):
|
||||
"SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE", 2048
|
||||
)
|
||||
self.disable_cuda_graph_kv_split = True
|
||||
global_config.flashinfer_workspace_size = 2048 * 1024 * 1024
|
||||
envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(2048 * 1024 * 1024)
|
||||
|
||||
# Allocate buffers
|
||||
global global_workspace_buffer
|
||||
if global_workspace_buffer is None:
|
||||
# This is a different buffer from flashinfer's zero_init_global_workspace_buffer
|
||||
global_workspace_size = global_config.flashinfer_workspace_size
|
||||
global_workspace_size = envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get()
|
||||
global_workspace_buffer = torch.empty(
|
||||
global_workspace_size,
|
||||
dtype=torch.uint8,
|
||||
|
||||
@@ -22,7 +22,7 @@ if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
|
||||
torch._logging.set_logs(dynamo=logging.ERROR)
|
||||
torch._dynamo.config.suppress_errors = True
|
||||
|
||||
from sglang.global_config import global_config
|
||||
from sglang.srt.environ import envs
|
||||
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
||||
from sglang.srt.layers.attention.flashinfer_backend import (
|
||||
create_flashinfer_kv_indices_triton,
|
||||
@@ -204,7 +204,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
|
||||
if global_workspace_buffer is None:
|
||||
# This is a different buffer from flashinfer's zero_init_global_workspace_buffer
|
||||
global_workspace_buffer = torch.empty(
|
||||
global_config.flashinfer_workspace_size,
|
||||
envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get(),
|
||||
dtype=torch.uint8,
|
||||
device=model_runner.device,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user