[Generative Score API] Scoring(Prefill-only) optimizations. (#9748)

This commit is contained in:
Sundara Raman Ramachandran
2025-09-13 10:57:06 -07:00
committed by GitHub
parent 94d0f656fb
commit a360511d7b
9 changed files with 325 additions and 48 deletions

View File

@@ -1261,11 +1261,19 @@ class Scheduler(
# Copy more attributes
if recv_req.logprob_start_len == -1 or not recv_req.return_logprob:
# By default, only return the logprobs for output tokens
req.logprob_start_len = len(req.origin_input_ids) - 1
# For prefill-only requests that give no explicit logprob_start_len (-1) or do not request logprobs,
# set logprob_start_len to the input length (one past the last input token) to skip input logprob computation entirely
if req.is_prefill_only:
req.logprob_start_len = len(req.origin_input_ids)
else:
# TODO: For text generation, evaluate setting logprob_start_len to len(req.origin_input_ids) as well
req.logprob_start_len = len(req.origin_input_ids) - 1
else:
req.logprob_start_len = recv_req.logprob_start_len
if req.logprob_start_len >= len(req.origin_input_ids):
if not req.is_prefill_only and req.logprob_start_len >= len(
req.origin_input_ids
):
error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
req.logprob_start_len = len(req.origin_input_ids) - 1
req.set_finish_with_abort(error_msg)