[PD]decode: add CLIP_MAX_NEW_TOKEN for pop_preallocated (#8866)

This commit is contained in:
Jimmy
2025-08-12 04:08:11 +08:00
committed by GitHub
parent 3d64fda376
commit 0d9e89ec69
2 changed files with 11 additions and 3 deletions

View File

@@ -51,7 +51,7 @@ from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
from sglang.srt.mem_cache.memory_pool import KVCache, ReqToTokenPool
from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.utils import require_mlp_sync
from sglang.srt.utils import get_int_env_var, require_mlp_sync
logger = logging.getLogger(__name__)
@@ -59,6 +59,10 @@ if TYPE_CHECKING:
from sglang.srt.managers.schedule_batch import Req
from sglang.srt.managers.scheduler import Scheduler
# Upper bound applied to a request's `sampling_params.max_new_tokens` when
# DecodePreallocQueue estimates how many tokens a request needs: the estimate
# uses min(max_new_tokens, DECODE_CLIP_MAX_NEW_TOKEN) rather than the raw
# max_new_tokens value.
# The value is read via get_int_env_var from the
# SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION environment variable; 4096 is passed as
# the second argument (presumably the default used when the variable is unset
# -- confirm against get_int_env_var).
DECODE_CLIP_MAX_NEW_TOKEN = get_int_env_var(
"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", 4096
)
class DecodeReqToTokenPool:
"""
@@ -384,7 +388,10 @@ class DecodePreallocQueue:
max(
required_tokens_for_request,
origin_input_len
+ decode_req.req.sampling_params.max_new_tokens
+ min(
decode_req.req.sampling_params.max_new_tokens,
DECODE_CLIP_MAX_NEW_TOKEN,
)
- retractable_tokens,
)
> allocatable_tokens
@@ -433,7 +440,7 @@ class DecodePreallocQueue:
need_space_for_single_req = (
max(
[
x.sampling_params.max_new_tokens
min(x.sampling_params.max_new_tokens, DECODE_CLIP_MAX_NEW_TOKEN)
+ len(x.origin_input_ids)
- retractable_tokens
for x in self.scheduler.running_batch.reqs