Fix mem fraction static for nightly tests (#11076)

2025-09-29 12:57:41 -07:00
parent 4eeaff74a0
commit dda34c2f93
8 changed files with 24 additions and 22 deletions
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -67,7 +67,7 @@ from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, TimeStats
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
-from sglang.srt.sampling.sampling_params import DEFAULT_SAMPLING_SEED, SamplingParams
+from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import flatten_nested_list, support_triton

--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1482,7 +1482,8 @@ class ModelRunner:

        if self.max_total_num_tokens <= 0:
            raise RuntimeError(
-                "Not enough memory. Please try to increase --mem-fraction-static."
+                f"Not enough memory. Please try to increase --mem-fraction-static. "
+                f"Current value: {self.server_args.mem_fraction_static=}"
            )

        # Initialize req_to_token_pool
--- a/python/sglang/srt/sampling/sampling_params.py
+++ b/python/sglang/srt/sampling/sampling_params.py
@@ -19,7 +19,6 @@ from sglang.srt.utils import get_bool_env_var

 _SAMPLING_EPS = 1e-6
 TOP_K_ALL = 1 << 30
-DEFAULT_SAMPLING_SEED = 42


 class SamplingParams:
@@ -56,7 +55,7 @@ class SamplingParams:
        custom_params: Optional[Dict[str, Any]] = None,
        stream_interval: Optional[int] = None,
        logit_bias: Optional[Dict[str, float]] = None,
-        sampling_seed: Optional[int] = None,
+        sampling_seed: int = 42,
    ) -> None:
        self.max_new_tokens = max_new_tokens
        self.stop_strs = stop
@@ -84,13 +83,6 @@ class SamplingParams:
        self.custom_params = custom_params
        self.stream_interval = stream_interval
        self.logit_bias = logit_bias
-        # Used for deterministic sampling
-        if (
-            get_bool_env_var("SGLANG_ENABLE_DETERMINISTIC_INFERENCE")
-            and sampling_seed is None
-        ):
-            # If deterministic inference is enabled and sampling_seed is not set, use the default seed
-            sampling_seed = DEFAULT_SAMPLING_SEED
        self.sampling_seed = sampling_seed

        # Process some special cases
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -618,7 +618,7 @@ class ServerArgs:

        if self.mem_fraction_static is None:
            # Constant meta data (e.g., from attention backend)
-            reserved_mem = 1024
+            reserved_mem = 512
            # For activation during large prefill
            if self.chunked_prefill_size > 0:
                reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
@@ -627,7 +627,7 @@ class ServerArgs:
            # For cuda graphs
            reserved_mem += self.cuda_graph_max_bs * 2
            # Some adjustments for large parallel size
-            reserved_mem += self.tp_size * self.pp_size / 4 * 1024
+            reserved_mem += self.tp_size * self.pp_size / 8 * 1024

            if self.enable_dp_attention:
                # DP attention needs more padding for some operations