Fix mem fraction static for nightly tests (#11076)
This commit is contained in:
@@ -67,7 +67,7 @@ from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
|
||||
from sglang.srt.metrics.collector import SchedulerMetricsCollector, TimeStats
|
||||
from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
|
||||
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
|
||||
- from sglang.srt.sampling.sampling_params import DEFAULT_SAMPLING_SEED, SamplingParams
|
||||
+ from sglang.srt.sampling.sampling_params import SamplingParams
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils import flatten_nested_list, support_triton
|
||||
|
||||
|
||||
@@ -1482,7 +1482,8 @@ class ModelRunner:
|
||||
|
||||
if self.max_total_num_tokens <= 0:
|
||||
raise RuntimeError(
|
||||
- "Not enough memory. Please try to increase --mem-fraction-static."
|
||||
+ f"Not enough memory. Please try to increase --mem-fraction-static. "
|
||||
+ f"Current value: {self.server_args.mem_fraction_static=}"
|
||||
)
|
||||
|
||||
# Initialize req_to_token_pool
|
||||
|
||||
@@ -19,7 +19,6 @@ from sglang.srt.utils import get_bool_env_var
|
||||
|
||||
_SAMPLING_EPS = 1e-6
|
||||
TOP_K_ALL = 1 << 30
|
||||
- DEFAULT_SAMPLING_SEED = 42
|
||||
|
||||
|
||||
class SamplingParams:
|
||||
@@ -56,7 +55,7 @@ class SamplingParams:
|
||||
custom_params: Optional[Dict[str, Any]] = None,
|
||||
stream_interval: Optional[int] = None,
|
||||
logit_bias: Optional[Dict[str, float]] = None,
|
||||
- sampling_seed: Optional[int] = None,
|
||||
+ sampling_seed: int = 42,
|
||||
) -> None:
|
||||
self.max_new_tokens = max_new_tokens
|
||||
self.stop_strs = stop
|
||||
@@ -84,13 +83,6 @@ class SamplingParams:
|
||||
self.custom_params = custom_params
|
||||
self.stream_interval = stream_interval
|
||||
self.logit_bias = logit_bias
|
||||
- # Used for deterministic sampling
|
||||
- if (
|
||||
- get_bool_env_var("SGLANG_ENABLE_DETERMINISTIC_INFERENCE")
|
||||
- and sampling_seed is None
|
||||
- ):
|
||||
- # If deterministic inference is enabled and sampling_seed is not set, use the default seed
|
||||
- sampling_seed = DEFAULT_SAMPLING_SEED
|
||||
self.sampling_seed = sampling_seed
|
||||
|
||||
# Process some special cases
|
||||
|
||||
@@ -618,7 +618,7 @@ class ServerArgs:
|
||||
|
||||
if self.mem_fraction_static is None:
|
||||
# Constant meta data (e.g., from attention backend)
|
||||
- reserved_mem = 1024
|
||||
+ reserved_mem = 512
|
||||
# For activation during large prefill
|
||||
if self.chunked_prefill_size > 0:
|
||||
reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
|
||||
@@ -627,7 +627,7 @@ class ServerArgs:
|
||||
# For cuda graphs
|
||||
reserved_mem += self.cuda_graph_max_bs * 2
|
||||
# Some adjustments for large parallel size
|
||||
- reserved_mem += self.tp_size * self.pp_size / 4 * 1024
|
||||
+ reserved_mem += self.tp_size * self.pp_size / 8 * 1024
|
||||
|
||||
if self.enable_dp_attention:
|
||||
# DP attention needs more padding for some operations
|
||||
|
||||
Reference in New Issue
Block a user