From 6493256b7d4b290ede988e5ee5425508249064c7 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 12 Feb 2024 12:43:48 +0000
Subject: [PATCH] improve print

---
 python/sglang/backend/openai.py             | 20 +++++++++-----
 .../sglang/srt/managers/router/model_rpc.py | 26 +++++++++----------
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/python/sglang/backend/openai.py b/python/sglang/backend/openai.py
index 86d9d9a8a..6cad2f6aa 100644
--- a/python/sglang/backend/openai.py
+++ b/python/sglang/backend/openai.py
@@ -4,7 +4,7 @@ from typing import Callable, List, Optional, Union
 import numpy as np
 
 from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template_by_model_path, ChatTemplate
+from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import SglSamplingParams
 
@@ -41,11 +41,15 @@ INSTRUCT_MODEL_NAMES = [
 
 
 class OpenAI(BaseBackend):
-    def __init__(self, model_name: str,
-                 is_chat_model: Optional[bool] = None,
-                 chat_template: Optional[ChatTemplate] = None,
-                 is_azure: bool = False,
-                 *args, **kwargs):
+    def __init__(
+        self,
+        model_name: str,
+        is_chat_model: Optional[bool] = None,
+        chat_template: Optional[ChatTemplate] = None,
+        is_azure: bool = False,
+        *args,
+        **kwargs,
+    ):
         super().__init__()
 
         if isinstance(openai, Exception):
@@ -63,7 +67,9 @@ class OpenAI(BaseBackend):
             self.tokenizer = tiktoken.get_encoding("cl100k_base")
             self.logit_bias_int = create_logit_bias_int(self.tokenizer)
 
-        self.chat_template = chat_template or get_chat_template_by_model_path(model_name)
+        self.chat_template = chat_template or get_chat_template_by_model_path(
+            model_name
+        )
 
         if is_chat_model is not None:
             self.is_chat_model = is_chat_model
diff --git a/python/sglang/srt/managers/router/model_rpc.py b/python/sglang/srt/managers/router/model_rpc.py
index 1db787603..c89fa67f3 100644
--- a/python/sglang/srt/managers/router/model_rpc.py
+++ b/python/sglang/srt/managers/router/model_rpc.py
@@ -208,6 +208,19 @@ class ModelRpcServer(rpyc.Service):
 
                 if self.out_pyobjs and self.running_batch.reqs[0].stream:
                     break
+
+                if self.running_batch is not None and self.tp_rank == 0:
+                    if self.decode_forward_ct % 40 == 0:
+                        num_used = self.max_total_num_token - (
+                            self.token_to_kv_pool.available_size()
+                            + self.tree_cache.evictable_size()
+                        )
+                        logger.info(
+                            f"#running-req: {len(self.running_batch.reqs)}, "
+                            f"#token: {num_used}, "
+                            f"token usage: {num_used / self.max_total_num_token:.2f}, "
+                            f"#queue-req: {len(self.forward_queue)}"
+                        )
             else:
                 # check the available size
                 available_size = (
@@ -221,19 +234,6 @@
                     "KV cache pool leak detected!"
                 )
 
-            if self.running_batch is not None and self.tp_rank == 0:
-                if self.decode_forward_ct % 20 == 0:
-                    num_used = self.max_total_num_token - (
-                        self.token_to_kv_pool.available_size()
-                        + self.tree_cache.evictable_size()
-                    )
-                    logger.info(
-                        f"#running-req: {len(self.running_batch.reqs)}, "
-                        f"#token: {num_used}, "
-                        f"token usage: {num_used / self.max_total_num_token:.2f}, "
-                        f"#queue-req: {len(self.forward_queue)}"
-                    )
-
     def handle_generate_request(
         self,
         recv_req: TokenizedGenerateReqInput,
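
Note on the model_rpc.py change: the patch moves the periodic token-usage log into the decode branch of forward_step and relaxes its interval from every 20 to every 40 decode forwards. For readers outside the sglang codebase, the sketch below isolates that throttled-logging pattern; TokenUsageReporter, on_decode_forward, and the plain integer arguments are hypothetical stand-ins, while decode_forward_ct, max_total_num_token, and the log format string are taken from the diff.

import logging

logger = logging.getLogger("model_rpc")

LOG_INTERVAL = 40  # the patch raises this modulus from 20 to 40


class TokenUsageReporter:
    """Hypothetical stand-in mirroring the throttled log in forward_step."""

    def __init__(self, max_total_num_token: int):
        self.max_total_num_token = max_total_num_token
        self.decode_forward_ct = 0

    def on_decode_forward(
        self, num_running: int, num_queued: int, available: int, evictable: int
    ) -> None:
        """Call once per decode forward; emits one log line every LOG_INTERVAL calls."""
        self.decode_forward_ct += 1
        if self.decode_forward_ct % LOG_INTERVAL == 0:
            # Tokens in use = pool capacity minus free slots and evictable cache slots.
            num_used = self.max_total_num_token - (available + evictable)
            logger.info(
                f"#running-req: {num_running}, "
                f"#token: {num_used}, "
                f"token usage: {num_used / self.max_total_num_token:.2f}, "
                f"#queue-req: {num_queued}"
            )

Throttling by a step counter rather than wall-clock time keeps the log cadence proportional to decode throughput, which is presumably why the interval was doubled here once the message moved onto the hot decode path.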