From d77caa2b757044f84e0078336b43de531cdd5688 Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Mon, 20 Jan 2025 04:36:53 +0900 Subject: [PATCH] [#2812] Make the decode status dict capacity adjustable by a CLI param (#2839) --- .../srt/managers/detokenizer_manager.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index a8dc14f01..972f9595b 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -15,6 +15,7 @@ import dataclasses import logging +import os import signal from collections import OrderedDict from typing import Dict, List, Union @@ -35,6 +36,12 @@ from sglang.utils import find_printable_text, get_exception_traceback logger = logging.getLogger(__name__) +# Maximum number of request states that detokenizer can hold. When exceeded, +# oldest request states will be evicted. Default: 65536 (1<<16). +# For more details, see: https://github.com/sgl-project/sglang/issues/2812 +# Use power of 2 values for better memory allocation. +DETOKENIZER_MAX_STATES = int(os.environ.get("SGLANG_DETOKENIZER_MAX_STATES", 1 << 16)) + @dataclasses.dataclass class DecodeStatus: @@ -74,7 +81,7 @@ class DetokenizerManager: revision=server_args.revision, ) - self.decode_status = LimitedCapacityDict() + self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES) def trim_matched_stop( self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool @@ -156,7 +163,17 @@ class DetokenizerManager: # Incremental decoding output_strs = [] for i in range(bs): - s = self.decode_status[recv_obj.rids[i]] + try: + s = self.decode_status[recv_obj.rids[i]] + except KeyError: + raise RuntimeError( + f"Decode status not found for request {recv_obj.rids[i]}. " + "It may be due to the request being evicted from the decode status due to memory pressure. 
" + "Please increase the maximum number of requests by setting " + "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. " + f"The current value is {DETOKENIZER_MAX_STATES}. " + "For more details, see: https://github.com/sgl-project/sglang/issues/2812" + ) new_text = read_texts[i][len(surr_texts[i]) :] if recv_obj.finished_reasons[i] is None: # Streaming chunk: update the decode status @@ -197,7 +214,7 @@ class DetokenizerManager: class LimitedCapacityDict(OrderedDict): - def __init__(self, capacity=1 << 15, *args, **kwargs): + def __init__(self, capacity: int, *args, **kwargs): super().__init__(*args, **kwargs) self.capacity = capacity