Add minimal vLLM 0.16.1 build repo for BI-V150

commit d69657327e (2026-04-18 10:56:22 +08:00)
1895 changed files with 615301 additions and 0 deletions

__init__.py

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import FastAPI

from vllm import envs


def register_instrumentator_api_routers(app: FastAPI):
    from .basic import router as basic_router

    app.include_router(basic_router)

    from .health import router as health_router

    app.include_router(health_router)

    from .metrics import attach_router as metrics_attach_router

    metrics_attach_router(app)

    from .offline_docs import attach_router as offline_docs_attach_router

    offline_docs_attach_router(app)

    if envs.VLLM_SERVER_DEV_MODE:
        from .server_info import router as server_info_router

        app.include_router(server_info_router)
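For orientation, here is a minimal sketch of how this registration helper could be wired into a server app. It is not part of the commit; the import path and the state attributes stubbed below are assumptions based on what the routers read from app.state.

from fastapi import FastAPI

# Hypothetical import; the package path is not shown in this diff.
from instrumentator import register_instrumentator_api_routers

app = FastAPI()
# The real entrypoint populates these before registration; stubs shown here.
app.state.engine_client = None                 # would be an EngineClient
app.state.openai_serving_tokenization = None   # would be OpenAIServingTokenization
app.state.server_load_metrics = 0              # counter read by GET /load
register_instrumentator_api_routers(app)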

basic.py

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
from vllm.logger import init_logger
from vllm.version import __version__ as VLLM_VERSION

router = APIRouter()
logger = init_logger(__name__)


def base(request: Request) -> OpenAIServing:
    # Reuse the existing instance
    return tokenization(request)


def tokenization(request: Request) -> OpenAIServingTokenization:
    return request.app.state.openai_serving_tokenization


def engine_client(request: Request) -> EngineClient:
    return request.app.state.engine_client


@router.get("/load")
async def get_server_load_metrics(request: Request):
    # This endpoint returns the current server load metrics.
    # It tracks requests utilizing the GPU from the following routes:
    # - /v1/responses
    # - /v1/responses/{response_id}
    # - /v1/responses/{response_id}/cancel
    # - /v1/messages
    # - /v1/chat/completions
    # - /v1/completions
    # - /v1/audio/transcriptions
    # - /v1/audio/translations
    # - /v1/embeddings
    # - /pooling
    # - /classify
    # - /score
    # - /v1/score
    # - /rerank
    # - /v1/rerank
    # - /v2/rerank
    return JSONResponse(content={"server_load": request.app.state.server_load_metrics})


@router.get("/version")
async def show_version():
    ver = {"version": VLLM_VERSION}
    return JSONResponse(content=ver)
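A quick smoke test of this router, sketched with FastAPI's TestClient; the import path is hypothetical (and assumes vLLM is installed), and the app state is stubbed the way the endpoints expect:

from fastapi import FastAPI
from fastapi.testclient import TestClient

from basic import router  # hypothetical import path

app = FastAPI()
app.state.server_load_metrics = 0  # normally maintained by the serving layer
app.include_router(router)

client = TestClient(app)
assert client.get("/load").json() == {"server_load": 0}
print(client.get("/version").json())  # e.g. {"version": "0.16.1"}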

health.py

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, Request
from fastapi.responses import Response

from vllm.engine.protocol import EngineClient
from vllm.logger import init_logger
from vllm.v1.engine.exceptions import EngineDeadError

logger = init_logger(__name__)

router = APIRouter()


def engine_client(request: Request) -> EngineClient:
    return request.app.state.engine_client


@router.get("/health", response_class=Response)
async def health(raw_request: Request) -> Response:
    """Health check."""
    try:
        await engine_client(raw_request).check_health()
        return Response(status_code=200)
    except EngineDeadError:
        return Response(status_code=503)
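The endpoint maps engine liveness onto HTTP status codes: 200 while check_health() succeeds, 503 once the engine is dead. A sketch with stub engine clients (the stub classes are made up for illustration; only check_health() matters to the route):

from fastapi import FastAPI
from fastapi.testclient import TestClient

from health import router  # hypothetical import path
from vllm.v1.engine.exceptions import EngineDeadError


class HealthyEngine:
    async def check_health(self) -> None:
        return None  # no exception means healthy


class DeadEngine:
    async def check_health(self) -> None:
        raise EngineDeadError()


for engine, expected in [(HealthyEngine(), 200), (DeadEngine(), 503)]:
    app = FastAPI()
    app.state.engine_client = engine
    app.include_router(router)
    assert TestClient(app).get("/health").status_code == expected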

metrics.py

@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import prometheus_client
import regex as re
from fastapi import FastAPI, Response
from prometheus_client import make_asgi_app
from prometheus_fastapi_instrumentator import Instrumentator
from starlette.routing import Mount

from vllm.v1.metrics.prometheus import get_prometheus_registry


class PrometheusResponse(Response):
    media_type = prometheus_client.CONTENT_TYPE_LATEST


def attach_router(app: FastAPI):
    """Mount prometheus metrics to a FastAPI app."""
    registry = get_prometheus_registry()

    # `response_class=PrometheusResponse` is needed to return an HTTP response
    # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8"
    # instead of the default "application/json" which is incorrect.
    # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364
    Instrumentator(
        excluded_handlers=[
            "/metrics",
            "/health",
            "/load",
            "/ping",
            "/version",
            "/server_info",
        ],
        registry=registry,
    ).add().instrument(app).expose(app, response_class=PrometheusResponse)

    # Add prometheus asgi middleware to route /metrics requests
    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))

    # Workaround for 307 Redirect for /metrics
    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
    app.routes.append(metrics_route)
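Mounting a sub-app at "/metrics" would normally answer a bare GET /metrics with a 307 redirect to /metrics/; overriding path_regex lets the un-slashed path match the mount directly. A hypothetical scrape, assuming vLLM is importable so attach_router can build the registry:

from fastapi import FastAPI
from fastapi.testclient import TestClient

from metrics import attach_router  # hypothetical import path

app = FastAPI()
attach_router(app)

resp = TestClient(app).get("/metrics")  # bare path, no trailing slash
assert resp.status_code == 200
assert resp.headers["content-type"].startswith("text/plain")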

offline_docs.py

@@ -0,0 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Offline FastAPI documentation support for air-gapped environments."""

import pathlib

from fastapi import FastAPI
from fastapi.openapi.docs import (
    get_swagger_ui_html,
    get_swagger_ui_oauth2_redirect_html,
)
from fastapi.staticfiles import StaticFiles

from vllm.logger import init_logger

logger = init_logger(__name__)


def attach_router(app: FastAPI) -> None:
    """Attach offline docs router if enabled via args."""
    args = getattr(app.state, "args", None)
    if args is None or not getattr(args, "enable_offline_docs", False):
        return

    static_dir = pathlib.Path(__file__).parent / "static"
    if not static_dir.exists():
        logger.warning(
            "Static directory not found at %s. Offline docs will not be available.",
            static_dir,
        )
        return

    app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")

    @app.get("/docs", include_in_schema=False)
    async def custom_swagger_ui_html():
        return get_swagger_ui_html(
            openapi_url=app.openapi_url,
            title=app.title + " - Swagger UI",
            oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url,
            swagger_js_url="/static/swagger-ui-bundle.js",
            swagger_css_url="/static/swagger-ui.css",
        )

    @app.get(app.swagger_ui_oauth2_redirect_url, include_in_schema=False)
    async def swagger_ui_redirect():
        return get_swagger_ui_oauth2_redirect_html()

    logger.info("Offline documentation enabled with vendored static assets")
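The router is opt-in: it activates only when the parsed CLI args stored on app.state set enable_offline_docs and the vendored assets exist next to the module. A hypothetical activation sketch; SimpleNamespace stands in for the real argparse namespace, and disabling the default docs_url follows FastAPI's documented pattern for self-hosted docs rather than anything shown in this diff:

from types import SimpleNamespace

from fastapi import FastAPI

from offline_docs import attach_router  # hypothetical import path

# docs_url=None disables the default CDN-backed /docs so the custom
# route below it can take over (an assumption, per FastAPI convention).
app = FastAPI(docs_url=None)
app.state.args = SimpleNamespace(enable_offline_docs=True)
attach_router(app)
# GET /docs now serves Swagger UI from the vendored /static assets
# instead of the public CDN.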

server_info.py

@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import functools
from typing import Annotated, Literal

import pydantic
from fastapi import APIRouter, Query, Request
from fastapi.responses import JSONResponse

import vllm.envs as envs
from vllm.collect_env import get_env_info
from vllm.config import VllmConfig
from vllm.logger import init_logger

logger = init_logger(__name__)

router = APIRouter()

PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)


def _get_vllm_env_vars():
    from vllm.config.utils import normalize_value

    vllm_envs = {}
    for key in dir(envs):
        if key.startswith("VLLM_") and "KEY" not in key:
            value = getattr(envs, key, None)
            if value is not None:
                value = normalize_value(value)
            vllm_envs[key] = value
    return vllm_envs


@functools.lru_cache(maxsize=1)
def _get_system_env_info_cached():
    return get_env_info()._asdict()


@router.get("/server_info")
async def show_server_info(
    raw_request: Request,
    config_format: Annotated[Literal["text", "json"], Query()] = "text",
):
    vllm_config: VllmConfig = raw_request.app.state.vllm_config
    server_info = {
        "vllm_config": (
            str(vllm_config)
            if config_format == "text"
            # fallback=str is needed to handle e.g. torch.dtype
            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
        ),
        "vllm_env": _get_vllm_env_vars(),
        "system_env": await asyncio.to_thread(_get_system_env_info_cached),
    }
    return JSONResponse(content=server_info)
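An example query against a running server, assuming the default port and VLLM_SERVER_DEV_MODE set (per the registration module shown first, the route only exists in dev mode):

import httpx

info = httpx.get(
    "http://localhost:8000/server_info",  # port 8000 is an assumption
    params={"config_format": "json"},
).json()
print(sorted(info))  # ['system_env', 'vllm_config', 'vllm_env']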

File diffs suppressed for two files because one or more lines are too long (likely the vendored swagger-ui-bundle.js and swagger-ui.css served by offline_docs.py).