From 138ff23187a8c75f68ecc7afddf33f2d3ee494d4 Mon Sep 17 00:00:00 2001 From: Jue WANG Date: Wed, 22 Oct 2025 21:57:12 -0700 Subject: [PATCH] Allow to disable batch decoding. (#11944) --- .../srt/managers/detokenizer_manager.py | 45 ++++++++++++++----- python/sglang/srt/server_args.py | 6 +++ 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index b3c6df7d5..8db48f82d 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -108,6 +108,7 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin): ) self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss" + self.disable_tokenizer_batch_decode = server_args.disable_tokenizer_batch_decode def event_loop(self): """The event loop that handles requests""" @@ -176,17 +177,39 @@ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin): ) surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset]) - # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request - surr_texts = self.tokenizer.batch_decode( - surr_ids, - skip_special_tokens=recv_obj.skip_special_tokens[0], - spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0], - ) - read_texts = self.tokenizer.batch_decode( - read_ids, - skip_special_tokens=recv_obj.skip_special_tokens[0], - spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0], - ) + # TODO(lmzheng): better handle skip_special_tokens/spaces_between_special_tokens per request + if self.disable_tokenizer_batch_decode: + surr_texts = [ + self.tokenizer.decode( + surr, skip_special_tokens=skip, spaces_between_special_tokens=space + ) + for surr, skip, space in zip( + surr_ids, + recv_obj.skip_special_tokens, + recv_obj.spaces_between_special_tokens, + ) + ] + read_texts = [ + self.tokenizer.decode( + read, skip_special_tokens=skip, spaces_between_special_tokens=space + ) + for read, skip, space in zip( + read_ids, + recv_obj.skip_special_tokens, + recv_obj.spaces_between_special_tokens, + ) + ] + else: + surr_texts = self.tokenizer.batch_decode( + surr_ids, + skip_special_tokens=recv_obj.skip_special_tokens[0], + spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0], + ) + read_texts = self.tokenizer.batch_decode( + read_ids, + skip_special_tokens=recv_obj.skip_special_tokens[0], + spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0], + ) # Incremental decoding output_strs = [] diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 7435725dc..2f992cecd 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -433,6 +433,7 @@ class ServerArgs: enable_symm_mem: bool = False disable_flashinfer_cutlass_moe_fp4_allgather: bool = False enable_tokenizer_batch_encode: bool = False + disable_tokenizer_batch_decode: bool = False disable_outlines_disk_cache: bool = False disable_custom_all_reduce: bool = False enable_mscclpp: bool = False @@ -2898,6 +2899,11 @@ class ServerArgs: action="store_true", help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.", ) + parser.add_argument( + "--disable-tokenizer-batch-decode", + action="store_true", + help="Disable batch decoding when decoding multiple completions.", + ) parser.add_argument( "--disable-outlines-disk-cache", action="store_true",