[Fix] memory leak by overlap + retract (#11981)

Co-authored-by: Liangsheng Yin <lsyincs@gmail.com>
This commit is contained in:
cctry
2025-10-23 07:59:23 -07:00
committed by GitHub
parent 6c18addb6f
commit b0b4f71679
9 changed files with 132 additions and 25 deletions

View File

@@ -194,7 +194,8 @@ from sglang.utils import TypeBasedDispatcher, get_exception_traceback
logger = logging.getLogger(__name__)
# Test retract decode for debugging purposes
TEST_RETRACT = get_bool_env_var("SGLANG_TEST_RETRACT")
TEST_RETRACT = envs.SGLANG_TEST_RETRACT.get()
TEST_RETRACT_INTERVAL = envs.SGLANG_TEST_RETRACT_INTERVAL.get()
GRAMMAR_TIMEOUT = float(os.environ.get("SGLANG_GRAMMAR_TIMEOUT", 300))
@@ -1017,6 +1018,9 @@ class Scheduler(
self.launch_batch_sample_if_needed(batch_result)
self.last_batch = batch
if envs.SGLANG_ENABLE_RUNTIME_MEM_LEAK_CHECK.get():
self._check_runtime_mem_leak()
def recv_requests(self) -> List[Req]:
"""Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
@@ -1833,7 +1837,7 @@ class Scheduler(
# Check if decode out of memory
if not batch.check_decode_mem(self.decode_mem_cache_buf_multiplier) or (
TEST_RETRACT and batch.batch_size() > 10
TEST_RETRACT and self.forward_ct % TEST_RETRACT_INTERVAL == 0
):
old_ratio = self.new_token_ratio
retracted_reqs, new_token_ratio, reqs_to_abort = batch.retract_decode(