diff --git a/docs/advanced_features/pd_disaggregation.md b/docs/advanced_features/pd_disaggregation.md index 2ff711751..b7a384c4c 100644 --- a/docs/advanced_features/pd_disaggregation.md +++ b/docs/advanced_features/pd_disaggregation.md @@ -67,7 +67,6 @@ Please be aware that this setting will cause prefill instances to take a longer | **`SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL`** | Interval (seconds) between health checks to prefill bootstrap servers | `5.0` | | **`SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE`** | Consecutive heartbeat failures before marking prefill server offline | `2` | | **`SGLANG_DISAGGREGATION_WAITING_TIMEOUT`** | Timeout (seconds) for receiving KV Cache after request initialization | `300` | -| **`SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION`** | Clip request param "max_tokens" to pre_allocate | `4096` | If a greater mean TTFT is acceptable, you can `export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600` (10 minutes) to relax the timeout condition. diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md index 2ce931b03..f22685454 100644 --- a/docs/references/environment_variables.md +++ b/docs/references/environment_variables.md @@ -45,7 +45,7 @@ SGLang supports various environment variables that can be used to configure its | Environment Variable | Description | Default Value | | --- | --- | --- | | `SGLANG_DEBUG_MEMORY_POOL` | Enable memory pool debugging | `false` | -| `SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION` | Clip max new tokens estimation for memory planning | Not set | +| `SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION` | Clip max new tokens estimation for memory planning | `4096` | | `SGLANG_DETOKENIZER_MAX_STATES` | Maximum states for detokenizer | Default value based on system | | `SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK` | Disable checks for memory imbalance across Tensor Parallel ranks | Not set (defaults to enabled check) | diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index f38535597..02f297d6a 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -59,9 +59,7 @@ if TYPE_CHECKING: from sglang.srt.managers.schedule_batch import Req from sglang.srt.managers.scheduler import Scheduler -DECODE_CLIP_MAX_NEW_TOKEN = get_int_env_var( - "SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", 4096 -) +CLIP_MAX_NEW_TOKEN = get_int_env_var("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", 4096) class DecodeReqToTokenPool: @@ -390,7 +388,7 @@ class DecodePreallocQueue: origin_input_len + min( decode_req.req.sampling_params.max_new_tokens, - DECODE_CLIP_MAX_NEW_TOKEN, + CLIP_MAX_NEW_TOKEN, ) - retractable_tokens, ) @@ -440,7 +438,7 @@ class DecodePreallocQueue: need_space_for_single_req = ( max( [ - min(x.sampling_params.max_new_tokens, DECODE_CLIP_MAX_NEW_TOKEN) + min(x.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKEN) + len(x.origin_input_ids) - retractable_tokens for x in self.scheduler.running_batch.reqs diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index eb14b9835..4665207c1 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -36,7 +36,7 @@ if TYPE_CHECKING: # This can prevent the server from being too conservative. # Note that this only clips the estimation in the scheduler but does not change the stop # condition. The request can still generate tokens until it hits the unclipped max_new_tokens. -CLIP_MAX_NEW_TOKENS_ESTIMATION = int( +CLIP_MAX_NEW_TOKENS = int( os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096") ) @@ -305,7 +305,7 @@ class PrefillAdder: [ min( (r.sampling_params.max_new_tokens - len(r.output_ids)), - CLIP_MAX_NEW_TOKENS_ESTIMATION, + CLIP_MAX_NEW_TOKENS, ) * self.new_token_ratio for r in running_batch.reqs @@ -388,7 +388,7 @@ class PrefillAdder: 0, req.extend_input_len, ( - min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION) + min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS) if not truncated else 0 ), @@ -477,7 +477,7 @@ class PrefillAdder: self._update_prefill_budget( 0, req.extend_input_len, - min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION), + min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS), ) else: if self.rem_chunk_tokens == 0: @@ -499,7 +499,7 @@ class PrefillAdder: return self.add_one_req_ignore_eos(req, has_chunked_req) total_tokens = req.extend_input_len + min( - req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION + req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS ) # adjusting the input_tokens based on host_hit_length and page_size @@ -544,7 +544,7 @@ class PrefillAdder: input_tokens, min( req.sampling_params.max_new_tokens, - CLIP_MAX_NEW_TOKENS_ESTIMATION, + CLIP_MAX_NEW_TOKENS, ), ) else: