clean up step function (#635)
This commit is contained in:
@@ -228,23 +228,7 @@ class ModelTpServer:
|
|||||||
|
|
||||||
# Print stats
|
# Print stats
|
||||||
if self.tp_rank == 0 and self.decode_forward_ct % 40 == 0:
|
if self.tp_rank == 0 and self.decode_forward_ct % 40 == 0:
|
||||||
num_used = self.max_total_num_tokens - (
|
self.print_stats()
|
||||||
self.token_to_kv_pool.available_size()
|
|
||||||
+ self.tree_cache.evictable_size()
|
|
||||||
)
|
|
||||||
throughput = self.num_generated_tokens / (
|
|
||||||
time.time() - self.last_stats_tic
|
|
||||||
)
|
|
||||||
self.num_generated_tokens = 0
|
|
||||||
self.last_stats_tic = time.time()
|
|
||||||
logger.info(
|
|
||||||
f"[gpu_id={self.gpu_id}] Decode batch. "
|
|
||||||
f"#running-req: {len(self.running_batch.reqs)}, "
|
|
||||||
f"#token: {num_used}, "
|
|
||||||
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
|
|
||||||
f"gen throughput (token/s): {throughput:.2f}, "
|
|
||||||
f"#queue-req: {len(self.forward_queue)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.running_batch.is_empty():
|
if self.running_batch.is_empty():
|
||||||
self.running_batch = None
|
self.running_batch = None
|
||||||
@@ -253,17 +237,38 @@ class ModelTpServer:
|
|||||||
if self.out_pyobjs and self.running_batch.has_stream():
|
if self.out_pyobjs and self.running_batch.has_stream():
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
# Check the available size
|
self.check_memory()
|
||||||
available_size = (
|
|
||||||
self.token_to_kv_pool.available_size()
|
def print_stats(self):
|
||||||
+ self.tree_cache.evictable_size()
|
num_used = self.max_total_num_tokens - (
|
||||||
)
|
self.token_to_kv_pool.available_size()
|
||||||
if available_size != self.max_total_num_tokens:
|
+ self.tree_cache.evictable_size()
|
||||||
warnings.warn(
|
)
|
||||||
"Warning: "
|
throughput = self.num_generated_tokens / (
|
||||||
f"available_size={available_size}, max_total_num_tokens={self.max_total_num_tokens}\n"
|
time.time() - self.last_stats_tic
|
||||||
"KV cache pool leak detected!"
|
)
|
||||||
)
|
self.num_generated_tokens = 0
|
||||||
|
self.last_stats_tic = time.time()
|
||||||
|
logger.info(
|
||||||
|
f"[gpu_id={self.gpu_id}] Decode batch. "
|
||||||
|
f"#running-req: {len(self.running_batch.reqs)}, "
|
||||||
|
f"#token: {num_used}, "
|
||||||
|
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
|
||||||
|
f"gen throughput (token/s): {throughput:.2f}, "
|
||||||
|
f"#queue-req: {len(self.forward_queue)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def check_memory(self):
|
||||||
|
available_size = (
|
||||||
|
self.token_to_kv_pool.available_size()
|
||||||
|
+ self.tree_cache.evictable_size()
|
||||||
|
)
|
||||||
|
if available_size != self.max_total_num_tokens:
|
||||||
|
warnings.warn(
|
||||||
|
"Warning: "
|
||||||
|
f"available_size={available_size}, max_total_num_tokens={self.max_total_num_tokens}\n"
|
||||||
|
"KV cache pool leak detected!"
|
||||||
|
)
|
||||||
|
|
||||||
def handle_generate_request(
|
def handle_generate_request(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user