[Feature] Speculative decoding support lookahead (#9873)
Co-authored-by: a4zhangfei <a4zhangfei@qq.com>
Co-authored-by: Qiaolin-Yu <liin1211@outlook.com>
This commit is contained in:
@@ -84,6 +84,7 @@ from sglang.srt.managers.tokenizer_communicator_mixin import TokenizerCommunicat
from sglang.srt.metrics.collector import TokenizerMetricsCollector
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.tracing.trace import (
    trace_get_proc_propagate_context,
    trace_req_finish,
@@ -174,6 +175,15 @@ class TokenizerManager(TokenizerCommunicatorMixin):
        self.image_token_id = self.model_config.image_token_id
        self.max_req_input_len = None  # Will be set later in engine.py

        speculative_algorithm = SpeculativeAlgorithm.from_string(
            server_args.speculative_algorithm
        )
        self.reserve_input_token_num = (
            0
            if speculative_algorithm.is_none()
            else server_args.speculative_num_draft_tokens
        )

        if self.model_config.is_multimodal:
            import_processors()
            try:
@@ -618,6 +628,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
        _max_req_len = self.context_len

        input_token_num = len(input_ids) if input_ids is not None else 0
        input_token_num += self.reserve_input_token_num
        if input_token_num >= self.context_len:
            if self.server_args.allow_auto_truncate:
                logger.warning(
||||
Reference in New Issue
Block a user