diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index 1570b8b32..4c761c9a6 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -259,7 +259,7 @@ class DecodePreallocQueue: if len(req.origin_input_ids) > self.max_total_num_tokens: message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}" logger.error(message) - prepare_abort(req, message) + prepare_abort(req, message, status_code=HTTPStatus.BAD_REQUEST) self.scheduler.stream_output([req], req.return_logprob) return True return False diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index 675e3708a..5f5d0ebc6 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -178,7 +178,7 @@ class PrefillBootstrapQueue: if len(req.origin_input_ids) > self.max_total_num_tokens: message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}" logger.error(message) - prepare_abort(req, message) + prepare_abort(req, message, status_code=HTTPStatus.BAD_REQUEST) self.scheduler.stream_output([req], req.return_logprob) return True return False diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 05878fe4e..91e02b08e 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1141,7 +1141,7 @@ class Scheduler( f"boostrap room id. {req.rid=}" ) logger.error(error_msg) - prepare_abort(req, error_msg) + prepare_abort(req, error_msg, status_code=HTTPStatus.BAD_REQUEST) self.stream_output([req], req.return_logprob) return diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 58220b1d6..3a81a3636 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -782,15 +782,17 @@ class TokenizerManager: ): raise ValueError(finish_reason["message"]) - if ( - finish_reason.get("type") == "abort" - and finish_reason.get("status_code") - == HTTPStatus.SERVICE_UNAVAILABLE + if finish_reason.get("type") == "abort" and finish_reason.get( + "status_code" + ) in ( + HTTPStatus.SERVICE_UNAVAILABLE, + HTTPStatus.INTERNAL_SERVER_ERROR, ): # This is an abort request initiated by scheduler. # Delete the key to prevent resending abort request to the scheduler and # to ensure aborted request state is cleaned up. - del self.rid_to_state[state.obj.rid] + if state.obj.rid in self.rid_to_state: + del self.rid_to_state[state.obj.rid] # Mark ongoing LoRA request as finished. if self.server_args.enable_lora and state.obj.lora_path: