50 lines
1.4 KiB
Python
50 lines
1.4 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
from collections.abc import Awaitable
|
|
|
|
from fastapi.responses import JSONResponse
|
|
from starlette.types import ASGIApp, Receive, Scope, Send
|
|
|
|
# Module-level flag tracking scaling state; ScalingMiddleware consults it
# (through get_scaling_elastic_ep) on every HTTP request.
_scaling_elastic_ep: bool = False


def get_scaling_elastic_ep() -> bool:
    """Return the current value of the module-level scaling flag."""
    return _scaling_elastic_ep


def set_scaling_elastic_ep(value: bool) -> None:
    """
    Replace the module-level scaling flag with ``value``.

    Args:
        value: New flag value. While it is truthy, ``ScalingMiddleware``
            answers HTTP requests with a 503 response.
    """
    # ``global`` is required so the assignment rebinds the module-level name
    # instead of creating a function-local variable.
    global _scaling_elastic_ep
    _scaling_elastic_ep = value
|
|
|
|
|
|
class ScalingMiddleware:
    """
    ASGI middleware that rejects HTTP requests while scaling is in progress.

    For every ``http`` scope the scaling flag is read via
    ``get_scaling_elastic_ep()``; when it is set, a 503 Service Unavailable
    JSON response is sent and the wrapped application is not invoked.
    Scopes of any other type are forwarded to the wrapped application
    without consulting the flag.
    """

    def __init__(self, app: ASGIApp) -> None:
        # The downstream ASGI application this middleware wraps.
        self.app = app

    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
        """
        Dispatch one ASGI connection.

        Returns the awaitable produced either by the 503 response or by the
        wrapped application; the caller is expected to await it.
        """
        # Short-circuit order matters: the flag is only read for HTTP scopes.
        if scope["type"] == "http" and get_scaling_elastic_ep():
            unavailable = JSONResponse(
                content={
                    "error": "The model is currently scaling. Please try again later."
                },
                status_code=503,
            )
            return unavailable(scope, receive, send)

        # Non-HTTP traffic, or HTTP traffic while not scaling: pass through.
        return self.app(scope, receive, send)
|