[Feature] Speculative decoding support lookahead (#9873)

Co-authored-by: a4zhangfei <a4zhangfei@qq.com>
Co-authored-by: Qiaolin-Yu <liin1211@outlook.com>
This commit is contained in:
Zhihao Zhang
2025-09-19 07:42:41 +08:00
committed by GitHub
parent 2a2ff9a840
commit e7bc600304
30 changed files with 2058 additions and 32 deletions

View File

@@ -84,6 +84,7 @@ from sglang.srt.managers.tokenizer_communicator_mixin import TokenizerCommunicat
from sglang.srt.metrics.collector import TokenizerMetricsCollector
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.tracing.trace import (
trace_get_proc_propagate_context,
trace_req_finish,
@@ -174,6 +175,15 @@ class TokenizerManager(TokenizerCommunicatorMixin):
self.image_token_id = self.model_config.image_token_id
self.max_req_input_len = None # Will be set later in engine.py
speculative_algorithm = SpeculativeAlgorithm.from_string(
server_args.speculative_algorithm
)
self.reserve_input_token_num = (
0
if speculative_algorithm.is_none()
else server_args.speculative_num_draft_tokens
)
if self.model_config.is_multimodal:
import_processors()
try:
@@ -618,6 +628,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
_max_req_len = self.context_len
input_token_num = len(input_ids) if input_ids is not None else 0
input_token_num += self.reserve_input_token_num
if input_token_num >= self.context_len:
if self.server_args.allow_auto_truncate:
logger.warning(