[PD] Propagate internal server errors from aborted requests to clients instead of blindly returning 200s (#8936)

This commit is contained in:
datdo-msft
2025-08-18 14:23:46 -07:00
committed by GitHub
parent 6805f6da40
commit 98b44e9e56
4 changed files with 10 additions and 8 deletions

View File

@@ -782,15 +782,17 @@ class TokenizerManager:
):
raise ValueError(finish_reason["message"])
if (
finish_reason.get("type") == "abort"
and finish_reason.get("status_code")
== HTTPStatus.SERVICE_UNAVAILABLE
if finish_reason.get("type") == "abort" and finish_reason.get(
"status_code"
) in (
HTTPStatus.SERVICE_UNAVAILABLE,
HTTPStatus.INTERNAL_SERVER_ERROR,
):
# This is an abort request initiated by scheduler.
# Delete the key to prevent resending abort request to the scheduler and
# to ensure aborted request state is cleaned up.
del self.rid_to_state[state.obj.rid]
if state.obj.rid in self.rid_to_state:
del self.rid_to_state[state.obj.rid]
# Mark ongoing LoRA request as finished.
if self.server_args.enable_lora and state.obj.lora_path: