From 45d6592d4053fe8b2b8dc9440f64c900de040d09 Mon Sep 17 00:00:00 2001
From: Ying Sheng
Date: Sat, 3 Feb 2024 04:59:06 -0800
Subject: [PATCH] Fix no-cache mode (#136)

---
 python/sglang/srt/managers/router/infer_batch.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/managers/router/infer_batch.py b/python/sglang/srt/managers/router/infer_batch.py
index 0c49f7d86..88f6031f7 100644
--- a/python/sglang/srt/managers/router/infer_batch.py
+++ b/python/sglang/srt/managers/router/infer_batch.py
@@ -215,8 +215,9 @@ class Batch:
         extend_num_tokens = seq_lens.sum() - prefix_lens.sum()
         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
         if out_cache_loc is None:
-            self.tree_cache.evict(extend_num_tokens, self.token_to_kv_pool.free)
-            out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
+            if not self.tree_cache.disable:
+                self.tree_cache.evict(extend_num_tokens, self.token_to_kv_pool.free)
+                out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
 
             if out_cache_loc is None:
                 print("Prefill out of memory. This should nerver happen.")
@@ -277,11 +278,11 @@ class Batch:
 
     def check_decode_mem(self):
         bs = len(self.reqs)
-        avai_size = self.token_to_kv_pool.available_size()
-        if avai_size >= bs:
+        if self.token_to_kv_pool.available_size() >= bs:
             return True
 
-        self.tree_cache.evict(bs, self.token_to_kv_pool.free)
+        if not self.tree_cache.disable:
+            self.tree_cache.evict(bs, self.token_to_kv_pool.free)
         if self.token_to_kv_pool.available_size() >= bs:
             return True
 