improve print
This commit is contained in:
@@ -4,7 +4,7 @@ from typing import Callable, List, Optional, Union
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sglang.backend.base_backend import BaseBackend
|
from sglang.backend.base_backend import BaseBackend
|
||||||
from sglang.lang.chat_template import get_chat_template_by_model_path, ChatTemplate
|
from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
|
||||||
from sglang.lang.interpreter import StreamExecutor
|
from sglang.lang.interpreter import StreamExecutor
|
||||||
from sglang.lang.ir import SglSamplingParams
|
from sglang.lang.ir import SglSamplingParams
|
||||||
|
|
||||||
@@ -41,11 +41,15 @@ INSTRUCT_MODEL_NAMES = [
|
|||||||
|
|
||||||
|
|
||||||
class OpenAI(BaseBackend):
|
class OpenAI(BaseBackend):
|
||||||
def __init__(self, model_name: str,
|
def __init__(
|
||||||
is_chat_model: Optional[bool] = None,
|
self,
|
||||||
chat_template: Optional[ChatTemplate] = None,
|
model_name: str,
|
||||||
is_azure: bool = False,
|
is_chat_model: Optional[bool] = None,
|
||||||
*args, **kwargs):
|
chat_template: Optional[ChatTemplate] = None,
|
||||||
|
is_azure: bool = False,
|
||||||
|
*args,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
if isinstance(openai, Exception):
|
if isinstance(openai, Exception):
|
||||||
@@ -63,7 +67,9 @@ class OpenAI(BaseBackend):
|
|||||||
self.tokenizer = tiktoken.get_encoding("cl100k_base")
|
self.tokenizer = tiktoken.get_encoding("cl100k_base")
|
||||||
self.logit_bias_int = create_logit_bias_int(self.tokenizer)
|
self.logit_bias_int = create_logit_bias_int(self.tokenizer)
|
||||||
|
|
||||||
self.chat_template = chat_template or get_chat_template_by_model_path(model_name)
|
self.chat_template = chat_template or get_chat_template_by_model_path(
|
||||||
|
model_name
|
||||||
|
)
|
||||||
|
|
||||||
if is_chat_model is not None:
|
if is_chat_model is not None:
|
||||||
self.is_chat_model = is_chat_model
|
self.is_chat_model = is_chat_model
|
||||||
|
|||||||
@@ -208,6 +208,19 @@ class ModelRpcServer(rpyc.Service):
|
|||||||
|
|
||||||
if self.out_pyobjs and self.running_batch.reqs[0].stream:
|
if self.out_pyobjs and self.running_batch.reqs[0].stream:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if self.running_batch is not None and self.tp_rank == 0:
|
||||||
|
if self.decode_forward_ct % 40 == 0:
|
||||||
|
num_used = self.max_total_num_token - (
|
||||||
|
self.token_to_kv_pool.available_size()
|
||||||
|
+ self.tree_cache.evictable_size()
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"#running-req: {len(self.running_batch.reqs)}, "
|
||||||
|
f"#token: {num_used}, "
|
||||||
|
f"token usage: {num_used / self.max_total_num_token:.2f}, "
|
||||||
|
f"#queue-req: {len(self.forward_queue)}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# check the available size
|
# check the available size
|
||||||
available_size = (
|
available_size = (
|
||||||
@@ -221,19 +234,6 @@ class ModelRpcServer(rpyc.Service):
|
|||||||
"KV cache pool leak detected!"
|
"KV cache pool leak detected!"
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.running_batch is not None and self.tp_rank == 0:
|
|
||||||
if self.decode_forward_ct % 20 == 0:
|
|
||||||
num_used = self.max_total_num_token - (
|
|
||||||
self.token_to_kv_pool.available_size()
|
|
||||||
+ self.tree_cache.evictable_size()
|
|
||||||
)
|
|
||||||
logger.info(
|
|
||||||
f"#running-req: {len(self.running_batch.reqs)}, "
|
|
||||||
f"#token: {num_used}, "
|
|
||||||
f"token usage: {num_used / self.max_total_num_token:.2f}, "
|
|
||||||
f"#queue-req: {len(self.forward_queue)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
def handle_generate_request(
|
def handle_generate_request(
|
||||||
self,
|
self,
|
||||||
recv_req: TokenizedGenerateReqInput,
|
recv_req: TokenizedGenerateReqInput,
|
||||||
|
|||||||
Reference in New Issue
Block a user