Add minimal vLLM 0.16.1 build repo for BI-V150

commit d69657327e (2026-04-18 10:56:22 +08:00)
1895 changed files with 615301 additions and 0 deletions

__init__.py

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import FastAPI

from vllm import envs


def register_instrumentator_api_routers(app: FastAPI):
    from .basic import router as basic_router

    app.include_router(basic_router)

    from .health import router as health_router

    app.include_router(health_router)

    from .metrics import attach_router as metrics_attach_router

    metrics_attach_router(app)

    from .offline_docs import attach_router as offline_docs_attach_router

    offline_docs_attach_router(app)

    if envs.VLLM_SERVER_DEV_MODE:
        from .server_info import router as server_info_router

        app.include_router(server_info_router)
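For orientation, here is a minimal sketch of how this registration helper could be wired into a server app. It is not part of the commit; the import path and the state attributes stubbed below are assumptions based on what the routers read from app.state.

from fastapi import FastAPI

# Hypothetical import; the package path is not shown in this diff.
from instrumentator import register_instrumentator_api_routers

app = FastAPI()
# The real entrypoint populates these before registration; stubs shown here.
app.state.engine_client = None                 # would be an EngineClient
app.state.openai_serving_tokenization = None   # would be OpenAIServingTokenization
app.state.server_load_metrics = 0              # counter read by GET /load
register_instrumentator_api_routers(app)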

basic.py

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
from vllm.logger import init_logger
from vllm.version import __version__ as VLLM_VERSION

router = APIRouter()
logger = init_logger(__name__)


def base(request: Request) -> OpenAIServing:
    # Reuse the existing instance
    return tokenization(request)


def tokenization(request: Request) -> OpenAIServingTokenization:
    return request.app.state.openai_serving_tokenization


def engine_client(request: Request) -> EngineClient:
    return request.app.state.engine_client


@router.get("/load")
async def get_server_load_metrics(request: Request):
    # This endpoint returns the current server load metrics.
    # It tracks requests utilizing the GPU from the following routes:
    # - /v1/responses
    # - /v1/responses/{response_id}
    # - /v1/responses/{response_id}/cancel
    # - /v1/messages
    # - /v1/chat/completions
    # - /v1/completions
    # - /v1/audio/transcriptions
    # - /v1/audio/translations
    # - /v1/embeddings
    # - /pooling
    # - /classify
    # - /score
    # - /v1/score
    # - /rerank
    # - /v1/rerank
    # - /v2/rerank
    return JSONResponse(content={"server_load": request.app.state.server_load_metrics})


@router.get("/version")
async def show_version():
    ver = {"version": VLLM_VERSION}
    return JSONResponse(content=ver)
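A quick smoke test of this router, sketched with FastAPI's TestClient; the import path is hypothetical (and assumes vLLM is installed), and the app state is stubbed the way the endpoints expect:

from fastapi import FastAPI
from fastapi.testclient import TestClient

from basic import router  # hypothetical import path

app = FastAPI()
app.state.server_load_metrics = 0  # normally maintained by the serving layer
app.include_router(router)

client = TestClient(app)
assert client.get("/load").json() == {"server_load": 0}
print(client.get("/version").json())  # e.g. {"version": "0.16.1"}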

health.py

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, Request
from fastapi.responses import Response

from vllm.engine.protocol import EngineClient
from vllm.logger import init_logger
from vllm.v1.engine.exceptions import EngineDeadError

logger = init_logger(__name__)

router = APIRouter()


def engine_client(request: Request) -> EngineClient:
    return request.app.state.engine_client


@router.get("/health", response_class=Response)
async def health(raw_request: Request) -> Response:
    """Health check."""
    try:
        await engine_client(raw_request).check_health()
        return Response(status_code=200)
    except EngineDeadError:
        return Response(status_code=503)
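The endpoint maps engine liveness onto HTTP status codes: 200 while check_health() succeeds, 503 once the engine is dead. A sketch with stub engine clients (the stub classes are made up for illustration; only check_health() matters to the route):

from fastapi import FastAPI
from fastapi.testclient import TestClient

from health import router  # hypothetical import path
from vllm.v1.engine.exceptions import EngineDeadError


class HealthyEngine:
    async def check_health(self) -> None:
        return None  # no exception means healthy


class DeadEngine:
    async def check_health(self) -> None:
        raise EngineDeadError()


for engine, expected in [(HealthyEngine(), 200), (DeadEngine(), 503)]:
    app = FastAPI()
    app.state.engine_client = engine
    app.include_router(router)
    assert TestClient(app).get("/health").status_code == expected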

metrics.py

@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import prometheus_client
import regex as re
from fastapi import FastAPI, Response
from prometheus_client import make_asgi_app
from prometheus_fastapi_instrumentator import Instrumentator
from starlette.routing import Mount

from vllm.v1.metrics.prometheus import get_prometheus_registry


class PrometheusResponse(Response):
    media_type = prometheus_client.CONTENT_TYPE_LATEST


def attach_router(app: FastAPI):
    """Mount prometheus metrics to a FastAPI app."""
    registry = get_prometheus_registry()

    # `response_class=PrometheusResponse` is needed to return an HTTP response
    # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8"
    # instead of the default "application/json" which is incorrect.
    # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364
    Instrumentator(
        excluded_handlers=[
            "/metrics",
            "/health",
            "/load",
            "/ping",
            "/version",
            "/server_info",
        ],
        registry=registry,
    ).add().instrument(app).expose(app, response_class=PrometheusResponse)

    # Add prometheus asgi middleware to route /metrics requests
    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))

    # Workaround for 307 Redirect for /metrics
    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
    app.routes.append(metrics_route)
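Mounting a sub-app at "/metrics" would normally answer a bare GET /metrics with a 307 redirect to /metrics/; overriding path_regex lets the un-slashed path match the mount directly. A hypothetical scrape, assuming vLLM is importable so attach_router can build the registry:

from fastapi import FastAPI
from fastapi.testclient import TestClient

from metrics import attach_router  # hypothetical import path

app = FastAPI()
attach_router(app)

resp = TestClient(app).get("/metrics")  # bare path, no trailing slash
assert resp.status_code == 200
assert resp.headers["content-type"].startswith("text/plain")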

offline_docs.py

@@ -0,0 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Offline FastAPI documentation support for air-gapped environments."""

import pathlib

from fastapi import FastAPI
from fastapi.openapi.docs import (
    get_swagger_ui_html,
    get_swagger_ui_oauth2_redirect_html,
)
from fastapi.staticfiles import StaticFiles

from vllm.logger import init_logger

logger = init_logger(__name__)


def attach_router(app: FastAPI) -> None:
    """Attach offline docs router if enabled via args."""
    args = getattr(app.state, "args", None)
    if args is None or not getattr(args, "enable_offline_docs", False):
        return

    static_dir = pathlib.Path(__file__).parent / "static"
    if not static_dir.exists():
        logger.warning(
            "Static directory not found at %s. Offline docs will not be available.",
            static_dir,
        )
        return

    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")

    @app.get("/docs", include_in_schema=False)
    async def custom_swagger_ui_html():
        return get_swagger_ui_html(
            openapi_url=app.openapi_url,
            title=app.title + " - Swagger UI",
            oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url,
            swagger_js_url="/static/swagger-ui-bundle.js",
            swagger_css_url="/static/swagger-ui.css",
        )

    @app.get(app.swagger_ui_oauth2_redirect_url, include_in_schema=False)
    async def swagger_ui_redirect():
        return get_swagger_ui_oauth2_redirect_html()

    logger.info("Offline documentation enabled with vendored static assets")
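The router is opt-in: it activates only when the parsed CLI args stored on app.state set enable_offline_docs and the vendored assets exist next to the module. A hypothetical activation sketch; SimpleNamespace stands in for the real argparse namespace, and disabling the default docs_url follows FastAPI's documented pattern for self-hosted docs rather than anything shown in this diff:

from types import SimpleNamespace

from fastapi import FastAPI

from offline_docs import attach_router  # hypothetical import path

# docs_url=None disables the default CDN-backed /docs so the custom
# route below it can take over (an assumption, per FastAPI convention).
app = FastAPI(docs_url=None)
app.state.args = SimpleNamespace(enable_offline_docs=True)
attach_router(app)
# GET /docs now serves Swagger UI from the vendored /static assets
# instead of the public CDN.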

server_info.py

@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import functools
from typing import Annotated, Literal

import pydantic
from fastapi import APIRouter, Query, Request
from fastapi.responses import JSONResponse

import vllm.envs as envs
from vllm.collect_env import get_env_info
from vllm.config import VllmConfig
from vllm.logger import init_logger

logger = init_logger(__name__)

router = APIRouter()

PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)


def _get_vllm_env_vars():
    from vllm.config.utils import normalize_value

    vllm_envs = {}
    for key in dir(envs):
        if key.startswith("VLLM_") and "KEY" not in key:
            value = getattr(envs, key, None)
            if value is not None:
                value = normalize_value(value)
            vllm_envs[key] = value
    return vllm_envs


@functools.lru_cache(maxsize=1)
def _get_system_env_info_cached():
    return get_env_info()._asdict()


@router.get("/server_info")
async def show_server_info(
    raw_request: Request,
    config_format: Annotated[Literal["text", "json"], Query()] = "text",
):
    vllm_config: VllmConfig = raw_request.app.state.vllm_config
    server_info = {
        "vllm_config": (
            str(vllm_config)
            if config_format == "text"
            # fallback=str is needed to handle e.g. torch.dtype
            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
        ),
        "vllm_env": _get_vllm_env_vars(),
        "system_env": await asyncio.to_thread(_get_system_env_info_cached),
    }
    return JSONResponse(content=server_info)
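An example query against a running server, assuming the default port and VLLM_SERVER_DEV_MODE set (per the registration module shown first, the route only exists in dev mode):

import httpx

info = httpx.get(
    "http://localhost:8000/server_info",  # port 8000 is an assumption
    params={"config_format": "json"},
).json()
print(sorted(info))  # ['system_env', 'vllm_config', 'vllm_env']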

File diffs suppressed for two files because one or more lines are too long (likely the vendored swagger-ui-bundle.js and swagger-ui.css served by offline_docs.py).