Fix mem fraction static for nightly tests (#11076)

This commit is contained in:
Lianmin Zheng
2025-09-29 12:57:41 -07:00
committed by GitHub
parent 4eeaff74a0
commit dda34c2f93
8 changed files with 24 additions and 22 deletions

View File

@@ -67,7 +67,7 @@ from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
from sglang.srt.metrics.collector import SchedulerMetricsCollector, TimeStats
from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
from sglang.srt.sampling.sampling_params import DEFAULT_SAMPLING_SEED, SamplingParams
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import flatten_nested_list, support_triton

View File

@@ -1482,7 +1482,8 @@ class ModelRunner:
if self.max_total_num_tokens <= 0:
raise RuntimeError(
"Not enough memory. Please try to increase --mem-fraction-static."
f"Not enough memory. Please try to increase --mem-fraction-static. "
f"Current value: {self.server_args.mem_fraction_static=}"
)
# Initialize req_to_token_pool

View File

@@ -19,7 +19,6 @@ from sglang.srt.utils import get_bool_env_var
_SAMPLING_EPS = 1e-6
TOP_K_ALL = 1 << 30
DEFAULT_SAMPLING_SEED = 42
class SamplingParams:
@@ -56,7 +55,7 @@ class SamplingParams:
custom_params: Optional[Dict[str, Any]] = None,
stream_interval: Optional[int] = None,
logit_bias: Optional[Dict[str, float]] = None,
sampling_seed: Optional[int] = None,
sampling_seed: int = 42,
) -> None:
self.max_new_tokens = max_new_tokens
self.stop_strs = stop
@@ -84,13 +83,6 @@ class SamplingParams:
self.custom_params = custom_params
self.stream_interval = stream_interval
self.logit_bias = logit_bias
# Used for deterministic sampling
if (
get_bool_env_var("SGLANG_ENABLE_DETERMINISTIC_INFERENCE")
and sampling_seed is None
):
# If deterministic inference is enabled and sampling_seed is not set, use the default seed
sampling_seed = DEFAULT_SAMPLING_SEED
self.sampling_seed = sampling_seed
# Process some special cases

View File

@@ -618,7 +618,7 @@ class ServerArgs:
if self.mem_fraction_static is None:
# Constant meta data (e.g., from attention backend)
reserved_mem = 1024
reserved_mem = 512
# For activation during large prefill
if self.chunked_prefill_size > 0:
reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
@@ -627,7 +627,7 @@ class ServerArgs:
# For cuda graphs
reserved_mem += self.cuda_graph_max_bs * 2
# Some adjustments for large parallel size
reserved_mem += self.tp_size * self.pp_size / 4 * 1024
reserved_mem += self.tp_size * self.pp_size / 8 * 1024
if self.enable_dp_attention:
# DP attention needs more padding for some operations