58 lines
1.6 KiB
Python
58 lines
1.6 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
from fastapi import APIRouter, Request
|
|
from fastapi.responses import JSONResponse
|
|
|
|
from vllm.engine.protocol import EngineClient
|
|
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
|
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
|
|
from vllm.logger import init_logger
|
|
from vllm.version import __version__ as VLLM_VERSION
|
|
|
|
router = APIRouter()
|
|
|
|
logger = init_logger(__name__)
|
|
|
|
|
|
def base(request: Request) -> OpenAIServing:
    """Return the base ``OpenAIServing`` handler for this request.

    Delegates to :func:`tokenization` so the single handler instance
    already stored on the application state is reused instead of
    constructing a new one.
    """
    return tokenization(request)
|
|
|
|
|
|
def tokenization(request: Request) -> OpenAIServingTokenization:
    """Fetch the shared tokenization handler from the application state."""
    app_state = request.app.state
    return app_state.openai_serving_tokenization
|
|
|
|
|
|
def engine_client(request: Request) -> EngineClient:
    """Fetch the engine client stored on the application state."""
    app_state = request.app.state
    return app_state.engine_client
|
|
|
|
|
|
@router.get("/load")
|
|
async def get_server_load_metrics(request: Request):
|
|
# This endpoint returns the current server load metrics.
|
|
# It tracks requests utilizing the GPU from the following routes:
|
|
# - /v1/responses
|
|
# - /v1/responses/{response_id}
|
|
# - /v1/responses/{response_id}/cancel
|
|
# - /v1/messages
|
|
# - /v1/chat/completions
|
|
# - /v1/completions
|
|
# - /v1/audio/transcriptions
|
|
# - /v1/audio/translations
|
|
# - /v1/embeddings
|
|
# - /pooling
|
|
# - /classify
|
|
# - /score
|
|
# - /v1/score
|
|
# - /rerank
|
|
# - /v1/rerank
|
|
# - /v2/rerank
|
|
return JSONResponse(content={"server_load": request.app.state.server_load_metrics})
|
|
|
|
|
|
@router.get("/version")
|
|
async def show_version():
|
|
ver = {"version": VLLM_VERSION}
|
|
return JSONResponse(content=ver)
|