[PD] Allow customizing reserved tokens to avoid KV cache waste (#6002)

This commit is contained in:
fzyzcjy
2025-05-05 11:23:15 +08:00
committed by GitHub
parent 357fb2dba5
commit 3008db9c1a

View File

@@ -97,7 +97,9 @@ class DecodePreallocQueue:
self.tp_size = tp_size
self.bootstrap_port = bootstrap_port
-self.num_reserved_decode_tokens = 512
+self.num_reserved_decode_tokens = int(
+    os.environ.get("SGLANG_NUM_RESERVED_DECODE_TOKENS", "512")
+)
# Queue for requests pending pre-allocation
self.queue: List[DecodeRequest] = []