Reduce memory accumulation in long-running server (#8306)
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
This commit is contained in:
@@ -30,7 +30,11 @@ class GlobalConfig:
|
|||||||
self.default_new_token_ratio_decay_steps = float(
|
self.default_new_token_ratio_decay_steps = float(
|
||||||
os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
|
os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
|
||||||
)
|
)
|
||||||
|
self.torch_empty_cache_interval = float(
|
||||||
|
os.environ.get(
|
||||||
|
"SGLANG_EMPTY_CACHE_INTERVAL", -1
|
||||||
|
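) # Minimum interval, in seconds, between torch.cuda.empty_cache() calls made from IdleSleeper.maybe_sleep; values <= 0 (the default is -1) disable this. Set it if you observe high memory accumulation over a long serving period.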
|
||||||
|
)
|
||||||
# Runtime constants: others
|
# Runtime constants: others
|
||||||
self.retract_decode_steps = 20
|
self.retract_decode_steps = 20
|
||||||
self.flashinfer_workspace_size = os.environ.get(
|
self.flashinfer_workspace_size = os.environ.get(
|
||||||
|
|||||||
@@ -2362,11 +2362,19 @@ class IdleSleeper:
|
|||||||
|
|
||||||
def __init__(self, sockets):
|
def __init__(self, sockets):
|
||||||
self.poller = zmq.Poller()
|
self.poller = zmq.Poller()
|
||||||
|
self.last_empty_time = time.time()
|
||||||
for s in sockets:
|
for s in sockets:
|
||||||
self.poller.register(s, zmq.POLLIN)
|
self.poller.register(s, zmq.POLLIN)
|
||||||
|
|
||||||
def maybe_sleep(self):
|
def maybe_sleep(self):
|
||||||
self.poller.poll(1000)
|
self.poller.poll(1000)
|
||||||
|
if (
|
||||||
|
global_config.torch_empty_cache_interval > 0
|
||||||
|
and time.time() - self.last_empty_time
|
||||||
|
> global_config.torch_empty_cache_interval
|
||||||
|
):
|
||||||
|
self.last_empty_time = time.time()
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
|
||||||
def is_health_check_generate_req(recv_req):
|
def is_health_check_generate_req(recv_req):
|
||||||
|
|||||||
Reference in New Issue
Block a user