[Performance] Dynamic Batch Tokenizer (#9382)

This commit is contained in:
Sundara Raman Ramachandran
2025-09-13 10:56:04 -07:00
committed by GitHub
parent eca59f96c3
commit 94d0f656fb
5 changed files with 1041 additions and 11 deletions

View File

@@ -373,6 +373,11 @@ class ServerArgs:
scheduler_recv_interval: int = 1
numa_node: Optional[List[int]] = None
# Dynamic batch tokenizer (batch_size is the maximum batch size; batch_timeout is in seconds)
enable_dynamic_batch_tokenizer: bool = False
dynamic_batch_tokenizer_batch_size: int = 32
dynamic_batch_tokenizer_batch_timeout: float = 0.002
# Debug tensor dumps
debug_tensor_dump_output_folder: Optional[str] = None
debug_tensor_dump_input_file: Optional[str] = None
@@ -874,6 +879,13 @@ class ServerArgs:
self.disable_cuda_graph = True
logger.warning("Cuda graph is disabled for prefill server")
# Validation: the two tokenizer batching features are mutually exclusive; reject enabling both at once
if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
raise ValueError(
"Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
"Please choose one tokenizer batching approach."
)
# Propagate env vars
os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
"1" if self.enable_torch_compile else "0"
@@ -2162,6 +2174,23 @@ class ServerArgs:
action="store_true",
help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
)
parser.add_argument(
"--enable-dynamic-batch-tokenizer",
action="store_true",
help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
)
parser.add_argument(
"--dynamic-batch-tokenizer-batch-size",
type=int,
default=ServerArgs.dynamic_batch_tokenizer_batch_size,
help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
)
parser.add_argument(
"--dynamic-batch-tokenizer-batch-timeout",
type=float,
default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
)
# PD disaggregation
parser.add_argument(