Add minimal vLLM 0.16.1 build repo for BI-V150
vllm/entrypoints/serve/instrumentator/__init__.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from fastapi import FastAPI

from vllm import envs


def register_instrumentator_api_routers(app: FastAPI):
    from .basic import router as basic_router

    app.include_router(basic_router)

    from .health import router as health_router

    app.include_router(health_router)

    from .metrics import attach_router as metrics_attach_router

    metrics_attach_router(app)

    from .offline_docs import attach_router as offline_docs_attach_router

    offline_docs_attach_router(app)

    if envs.VLLM_SERVER_DEV_MODE:
        from .server_info import router as server_info_router

        app.include_router(server_info_router)
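For context, a minimal sketch (not part of this diff) of how the registration hook above might be exercised. The bare FastAPI app and TestClient are assumptions for illustration; the real server wires up engine state before serving:

from fastapi import FastAPI
from fastapi.testclient import TestClient

from vllm.entrypoints.serve.instrumentator import register_instrumentator_api_routers

app = FastAPI()
register_instrumentator_api_routers(app)  # mounts /load, /version, /health, /metrics, ...

client = TestClient(app)
print(client.get("/version").json())  # e.g. {"version": "0.16.1"}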
vllm/entrypoints/serve/instrumentator/basic.py (new file, 57 lines)
@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
from vllm.logger import init_logger
from vllm.version import __version__ as VLLM_VERSION

router = APIRouter()

logger = init_logger(__name__)


def base(request: Request) -> OpenAIServing:
    # Reuse the existing instance
    return tokenization(request)


def tokenization(request: Request) -> OpenAIServingTokenization:
    return request.app.state.openai_serving_tokenization


def engine_client(request: Request) -> EngineClient:
    return request.app.state.engine_client


@router.get("/load")
async def get_server_load_metrics(request: Request):
    # This endpoint returns the current server load metrics.
    # It tracks requests utilizing the GPU from the following routes:
    # - /v1/responses
    # - /v1/responses/{response_id}
    # - /v1/responses/{response_id}/cancel
    # - /v1/messages
    # - /v1/chat/completions
    # - /v1/completions
    # - /v1/audio/transcriptions
    # - /v1/audio/translations
    # - /v1/embeddings
    # - /pooling
    # - /classify
    # - /score
    # - /v1/score
    # - /rerank
    # - /v1/rerank
    # - /v2/rerank
    return JSONResponse(content={"server_load": request.app.state.server_load_metrics})


@router.get("/version")
async def show_version():
    ver = {"version": VLLM_VERSION}
    return JSONResponse(content=ver)
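A hedged usage sketch for the two routes above, assuming a server is already listening on localhost:8000 (host and port are illustrative, not part of this commit):

import httpx

print(httpx.get("http://localhost:8000/version").json())
# -> {"version": "<vLLM version string>"}

print(httpx.get("http://localhost:8000/load").json())
# -> {"server_load": <count of in-flight GPU-bound requests>}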
vllm/entrypoints/serve/instrumentator/health.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


from fastapi import APIRouter, Request
from fastapi.responses import Response

from vllm.engine.protocol import EngineClient
from vllm.logger import init_logger
from vllm.v1.engine.exceptions import EngineDeadError

logger = init_logger(__name__)


router = APIRouter()


def engine_client(request: Request) -> EngineClient:
    return request.app.state.engine_client


@router.get("/health", response_class=Response)
async def health(raw_request: Request) -> Response:
    """Health check."""
    try:
        await engine_client(raw_request).check_health()
        return Response(status_code=200)
    except EngineDeadError:
        return Response(status_code=503)
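As a sketch of how the 200/503 contract above might be consumed, a simple startup probe; the base URL, poll interval, and timeout are assumptions:

import time

import httpx


def wait_until_healthy(base_url: str = "http://localhost:8000",
                       timeout: float = 60.0) -> bool:
    """Poll /health until the engine reports healthy or the deadline passes."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if httpx.get(f"{base_url}/health").status_code == 200:
                return True  # engine answered check_health() without error
        except httpx.TransportError:
            pass  # server socket not up yet; keep polling
        time.sleep(1.0)
    return False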
vllm/entrypoints/serve/instrumentator/metrics.py (new file, 45 lines)
@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import prometheus_client
import regex as re
from fastapi import FastAPI, Response
from prometheus_client import make_asgi_app
from prometheus_fastapi_instrumentator import Instrumentator
from starlette.routing import Mount

from vllm.v1.metrics.prometheus import get_prometheus_registry


class PrometheusResponse(Response):
    media_type = prometheus_client.CONTENT_TYPE_LATEST


def attach_router(app: FastAPI):
    """Mount prometheus metrics to a FastAPI app."""

    registry = get_prometheus_registry()

    # `response_class=PrometheusResponse` is needed to return an HTTP response
    # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8"
    # instead of the default "application/json" which is incorrect.
    # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364
    Instrumentator(
        excluded_handlers=[
            "/metrics",
            "/health",
            "/load",
            "/ping",
            "/version",
            "/server_info",
        ],
        registry=registry,
    ).add().instrument(app).expose(app, response_class=PrometheusResponse)

    # Add prometheus asgi middleware to route /metrics requests
    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))

    # Workaround for 307 Redirect for /metrics
    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
    app.routes.append(metrics_route)
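A quick scrape of the endpoint mounted above, assuming a running server on localhost:8000; the `vllm:` prefix matches vLLM's Prometheus metric naming:

import httpx

resp = httpx.get("http://localhost:8000/metrics")
# The PrometheusResponse class above ensures the exposition content type.
assert resp.headers["content-type"].startswith("text/plain")
for line in resp.text.splitlines():
    if line.startswith("vllm:"):
        print(line)  # e.g. vllm:num_requests_running{...} 0.0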
vllm/entrypoints/serve/instrumentator/offline_docs.py (new file, 50 lines)
@@ -0,0 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Offline FastAPI documentation support for air-gapped environments."""

import pathlib

from fastapi import FastAPI
from fastapi.openapi.docs import (
    get_swagger_ui_html,
    get_swagger_ui_oauth2_redirect_html,
)
from fastapi.staticfiles import StaticFiles

from vllm.logger import init_logger

logger = init_logger(__name__)


def attach_router(app: FastAPI) -> None:
    """Attach offline docs router if enabled via args."""
    args = getattr(app.state, "args", None)
    if args is None or not getattr(args, "enable_offline_docs", False):
        return

    static_dir = pathlib.Path(__file__).parent / "static"

    if not static_dir.exists():
        logger.warning(
            "Static directory not found at %s. Offline docs will not be available.",
            static_dir,
        )
        return

    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")

    @app.get("/docs", include_in_schema=False)
    async def custom_swagger_ui_html():
        return get_swagger_ui_html(
            openapi_url=app.openapi_url,
            title=app.title + " - Swagger UI",
            oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url,
            swagger_js_url="/static/swagger-ui-bundle.js",
            swagger_css_url="/static/swagger-ui.css",
        )

    @app.get(app.swagger_ui_oauth2_redirect_url, include_in_schema=False)
    async def swagger_ui_redirect():
        return get_swagger_ui_oauth2_redirect_html()

    logger.info("Offline documentation enabled with vendored static assets")
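A minimal sketch of the gating logic above; SimpleNamespace stands in for the parsed CLI args object the server normally places on app.state (an assumption about that wiring):

from types import SimpleNamespace

from fastapi import FastAPI

from vllm.entrypoints.serve.instrumentator.offline_docs import attach_router

app = FastAPI()
attach_router(app)  # no app.state.args -> returns without touching the app

app.state.args = SimpleNamespace(enable_offline_docs=True)
attach_router(app)  # mounts /static and overrides /docs, if the assets exist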
vllm/entrypoints/serve/instrumentator/server_info.py (new file, 59 lines)
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import asyncio
import functools
from typing import Annotated, Literal

import pydantic
from fastapi import APIRouter, Query, Request
from fastapi.responses import JSONResponse

import vllm.envs as envs
from vllm.collect_env import get_env_info
from vllm.config import VllmConfig
from vllm.logger import init_logger

logger = init_logger(__name__)


router = APIRouter()
PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)


def _get_vllm_env_vars():
    from vllm.config.utils import normalize_value

    vllm_envs = {}
    for key in dir(envs):
        if key.startswith("VLLM_") and "KEY" not in key:
            value = getattr(envs, key, None)
            if value is not None:
                value = normalize_value(value)
            vllm_envs[key] = value
    return vllm_envs


@functools.lru_cache(maxsize=1)
def _get_system_env_info_cached():
    return get_env_info()._asdict()


@router.get("/server_info")
async def show_server_info(
    raw_request: Request,
    config_format: Annotated[Literal["text", "json"], Query()] = "text",
):
    vllm_config: VllmConfig = raw_request.app.state.vllm_config
    server_info = {
        "vllm_config": (
            str(vllm_config)
            if config_format == "text"
            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
        ),
        # fallback=str is needed to handle e.g. torch.dtype
        "vllm_env": _get_vllm_env_vars(),
        "system_env": await asyncio.to_thread(_get_system_env_info_cached),
    }
    return JSONResponse(content=server_info)
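Finally, a hedged sketch of querying this dev-mode route (registered only when VLLM_SERVER_DEV_MODE is set, per __init__.py above); host and port are illustrative:

import httpx

resp = httpx.get(
    "http://localhost:8000/server_info",
    params={"config_format": "json"},
)
info = resp.json()
print(sorted(info))  # ['system_env', 'vllm_config', 'vllm_env']
print(info["vllm_env"].get("VLLM_SERVER_DEV_MODE"))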
File diff suppressed because one or more lines are too long