Fix #2037 - Context length check does not take into account pad tokens for visual models (#2106)

This commit is contained in:
Jake Poznanski
2024-11-21 19:05:41 -08:00
committed by GitHub
parent 30af7dfb34
commit 8048c28c11
2 changed files with 67 additions and 0 deletions

View File

@@ -557,6 +557,15 @@ class Scheduler:
req.origin_input_ids_unpadded, req.image_inputs
)
if len(req.origin_input_ids) > self.max_req_input_len:
req.finished_reason = FINISH_ABORT(
"Image request length is longer than the KV cache pool size or "
"the max context length aborting because you cannot truncate the image embeds"
)
req.sampling_params.max_new_tokens = 0
self.waiting_queue.append(req)
return
req.return_logprob = recv_req.return_logprob
req.top_logprobs_num = recv_req.top_logprobs_num
req.stream = recv_req.stream