Sync from v0.13
This commit is contained in:
49
vllm/entrypoints/serve/elastic_ep/middleware.py
Normal file
49
vllm/entrypoints/serve/elastic_ep/middleware.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Awaitable
|
||||
|
||||
from fastapi.responses import JSONResponse
|
||||
from starlette.types import ASGIApp, Receive, Scope, Send
|
||||
|
||||
# Global variable to track scaling state
|
||||
_scaling_elastic_ep = False
|
||||
|
||||
|
||||
def get_scaling_elastic_ep():
|
||||
return _scaling_elastic_ep
|
||||
|
||||
|
||||
def set_scaling_elastic_ep(value):
|
||||
global _scaling_elastic_ep
|
||||
_scaling_elastic_ep = value
|
||||
|
||||
|
||||
class ScalingMiddleware:
|
||||
"""
|
||||
Middleware that checks if the model is currently scaling and
|
||||
returns a 503 Service Unavailable response if it is.
|
||||
|
||||
This middleware applies to all HTTP requests and prevents
|
||||
processing when the model is in a scaling state.
|
||||
"""
|
||||
|
||||
def __init__(self, app: ASGIApp) -> None:
|
||||
self.app = app
|
||||
|
||||
def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
|
||||
if scope["type"] != "http":
|
||||
return self.app(scope, receive, send)
|
||||
|
||||
# Check global scaling state
|
||||
if get_scaling_elastic_ep():
|
||||
# Return 503 Service Unavailable response
|
||||
response = JSONResponse(
|
||||
content={
|
||||
"error": "The model is currently scaling. Please try again later."
|
||||
},
|
||||
status_code=503,
|
||||
)
|
||||
return response(scope, receive, send)
|
||||
|
||||
return self.app(scope, receive, send)
|
||||
Reference in New Issue
Block a user