feat: throttle requests at scheduler based on --max_queued_requests (#7565)

This commit is contained in:
harrisonlimh
2025-07-28 07:32:33 -07:00
committed by GitHub
parent b582159246
commit 747dd45077
10 changed files with 218 additions and 6 deletions

View File

@@ -38,7 +38,7 @@ import orjson
import requests
import uvicorn
import uvloop
from fastapi import Depends, FastAPI, Request, UploadFile
from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import ORJSONResponse, Response, StreamingResponse
@@ -174,6 +174,18 @@ app.add_middleware(
)
@app.exception_handler(HTTPException)
async def validation_exception_handler(request: Request, exc: HTTPException):
    """Render an ``HTTPException`` as a JSON ``ErrorResponse`` body.

    The exception's status code is used as the HTTP status of the response,
    as the ``code`` field and (stringified) as the ``type`` field of the
    error payload; the exception's ``detail`` becomes the ``message``.

    Args:
        request: The incoming request. Unused here, but part of the
            exception-handler call signature.
        exc: The HTTP exception being handled.

    Returns:
        An ``ORJSONResponse`` containing the serialized error, sent with the
        exception's status code and any headers attached to the exception.
    """
    # NOTE(review): the RequestValidationError handler defined right after
    # this one uses the same function name. Registration happens through the
    # decorator, so this handler stays registered, but the module-level name
    # ends up bound to the later function. Consider renaming one of them.
    error = ErrorResponse(
        object="error",
        message=exc.detail,
        type=str(exc.status_code),
        code=exc.status_code,
    )
    return ORJSONResponse(
        content=error.model_dump(),
        status_code=exc.status_code,
        # Forward headers carried by the exception (e.g. Retry-After or
        # WWW-Authenticate) instead of silently dropping them. getattr keeps
        # this safe for exception objects that do not define ``headers``.
        headers=getattr(exc, "headers", None),
    )
# Custom exception handlers to change validation error status codes
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):

View File

@@ -4,7 +4,7 @@ import uuid
from abc import ABC, abstractmethod
from typing import Any, Optional, Union
from fastapi import Request
from fastapi import HTTPException, Request
from fastapi.responses import ORJSONResponse, StreamingResponse
from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
@@ -45,7 +45,10 @@ class OpenAIServingBase(ABC):
return await self._handle_non_streaming_request(
adapted_request, processed_request, raw_request
)
except HTTPException as e:
return self.create_error_response(
message=e.detail, err_type=str(e.status_code), status_code=e.status_code
)
except Exception as e:
logger.exception(f"Error in request: {e}")
return self.create_error_response(