[PD] Allow customizing reserved tokens to avoid KV cache waste (#6002)
This commit is contained in:
@@ -97,7 +97,9 @@ class DecodePreallocQueue:
|
|||||||
self.tp_size = tp_size
|
self.tp_size = tp_size
|
||||||
self.bootstrap_port = bootstrap_port
|
self.bootstrap_port = bootstrap_port
|
||||||
|
|
||||||
self.num_reserved_decode_tokens = 512
|
self.num_reserved_decode_tokens = int(
|
||||||
|
os.environ.get("SGLANG_NUM_RESERVED_DECODE_TOKENS", "512")
|
||||||
|
)
|
||||||
|
|
||||||
# Queue for requests pending pre-allocation
|
# Queue for requests pending pre-allocation
|
||||||
self.queue: List[DecodeRequest] = []
|
self.queue: List[DecodeRequest] = []
|
||||||
|
|||||||
Reference in New Issue
Block a user