diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index bc9e4a53b..b4bc1e7a4 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -29,7 +29,6 @@ from sglang.srt.managers.io_struct import ( BatchStrOut, BatchTokenIDOut, ) -from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.utils import configure_logger, get_zmq_socket from sglang.utils import find_printable_text, get_exception_traceback diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 4ece87868..4680b042d 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1198,6 +1198,7 @@ class Scheduler: decode_ids_list = [] read_offsets = [] output_ids = [] + skip_special_tokens = [] spaces_between_special_tokens = [] no_stop_trim = [] diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 29b98df2e..8f147bf8b 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -623,23 +623,23 @@ class TokenizerManager: i, ) + if not isinstance(recv_obj, BatchEmbeddingOut): + meta_info.update( + { + "completion_tokens": recv_obj.completion_tokens[i], + "cached_tokens": recv_obj.cached_tokens[i], + } + ) + if isinstance(recv_obj, BatchStrOut): out_dict = { "text": recv_obj.output_strs[i], - "meta_info": { - **meta_info, - "completion_tokens": recv_obj.completion_tokens[i], - "cached_tokens": recv_obj.cached_tokens[i], - }, + "meta_info": meta_info, } elif isinstance(recv_obj, BatchTokenIDOut): out_dict = { "token_ids": recv_obj.output_ids[i], - "meta_info": { - **meta_info, - "completion_tokens": recv_obj.completion_tokens[i], - "cached_tokens": recv_obj.cached_tokens[i], - }, + "meta_info": meta_info, } else: assert isinstance(recv_obj, BatchEmbeddingOut) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 3f0cbecac..ebda816db 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -114,7 +114,7 @@ class ModelRunner: server_args.chunked_prefill_size = -1 self.mem_fraction_static *= 0.95 logger.info( - f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static} " + f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} " f"and turn off chunked prefill " f"because this is a multimodal model." )