Add minimal vLLM 0.16.1 build repo for BI-V150

This commit is contained in:
2026-04-18 10:56:22 +08:00
commit d69657327e
1895 changed files with 615301 additions and 0 deletions

157
vllm/tracing/__init__.py Normal file
View File

@@ -0,0 +1,157 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
from collections.abc import Callable
from typing import Any, TypeAlias
# Import the implementation details
from .otel import (
SpanKind,
extract_trace_context,
init_otel_tracer,
init_otel_worker_tracer,
instrument_otel,
is_otel_available,
manual_instrument_otel,
otel_import_error_traceback,
)
from .utils import (
SpanAttributes,
contains_trace_headers,
extract_trace_headers,
log_tracing_disabled_warning,
)
# Public API of the tracing package. Backend-agnostic entry points
# (instrument, init_tracer, ...) are defined below; the remaining names are
# re-exported from the .otel and .utils implementation modules above.
__all__ = [
    "instrument",
    "instrument_manual",
    "init_tracer",
    "maybe_init_worker_tracer",
    "is_tracing_available",
    "SpanAttributes",
    "SpanKind",
    "extract_trace_context",
    "extract_trace_headers",
    "log_tracing_disabled_warning",
    "contains_trace_headers",
    "otel_import_error_traceback",
]

# Aliases describing the five callables every tracing backend must provide.
BackendAvailableFunc: TypeAlias = Callable[[], bool]  # is the backend importable?
InstrumentFunc: TypeAlias = Callable[..., Any]  # decorator-style instrumentation
InstrumentManualFunc: TypeAlias = Callable[..., Any]  # explicit-timestamp spans
InitTracerFunc: TypeAlias = Callable[..., Any]  # main-process tracer init
InitWorkerTracerFunc: TypeAlias = Callable[..., Any]  # worker-process tracer init

# Registry of tracing backends, keyed by name. Each value is the 5-tuple
# (available, init_tracer, init_worker_tracer, instrument, manual_instrument).
# Currently only the OpenTelemetry backend is registered.
_REGISTERED_TRACING_BACKENDS: dict[
    str,
    tuple[
        BackendAvailableFunc,
        InitTracerFunc,
        InitWorkerTracerFunc,
        InstrumentFunc,
        InstrumentManualFunc,
    ],
] = {
    "otel": (
        is_otel_available,
        init_otel_tracer,
        init_otel_worker_tracer,
        instrument_otel,
        manual_instrument_otel,
    ),
}
def init_tracer(
    instrumenting_module_name: str,
    otlp_traces_endpoint: str,
    extra_attributes: dict[str, str] | None = None,
):
    """Initialize the process-wide tracer via the registered "otel" backend.

    Returns the backend's tracer when OpenTelemetry is importable; otherwise
    returns None.
    """
    backend = _REGISTERED_TRACING_BACKENDS["otel"]
    backend_available, make_tracer = backend[0], backend[1]
    if not backend_available():
        return None
    return make_tracer(
        instrumenting_module_name, otlp_traces_endpoint, extra_attributes
    )
def maybe_init_worker_tracer(
    instrumenting_module_name: str,
    process_kind: str,
    process_name: str,
):
    """Initialize a tracer in a worker process, if a backend is available.

    Returns the worker tracer from the "otel" backend, or None when the
    backend is not importable.
    """
    backend = _REGISTERED_TRACING_BACKENDS["otel"]
    backend_available, make_worker_tracer = backend[0], backend[2]
    if not backend_available():
        return None
    return make_worker_tracer(instrumenting_module_name, process_kind, process_name)
def instrument(
obj: Callable | None = None,
*,
span_name: str = "",
attributes: dict[str, str] | None = None,
record_exception: bool = True,
):
"""
Generic decorator to instrument functions.
"""
if obj is None:
return functools.partial(
instrument,
span_name=span_name,
attributes=attributes,
record_exception=record_exception,
)
# Dispatch to OTel (and potentially others later)
is_available, _, _, otel_instrument, _ = _REGISTERED_TRACING_BACKENDS["otel"]
if is_available():
return otel_instrument(
func=obj,
span_name=span_name,
attributes=attributes,
record_exception=record_exception,
)
else:
return obj
def instrument_manual(
    span_name: str,
    start_time: int,
    end_time: int | None = None,
    attributes: dict[str, Any] | None = None,
    context: Any = None,
    kind: Any = None,
):
    """Manually create a span with explicit timestamps.

    Args:
        span_name: Name of the span to create.
        start_time: Start time in nanoseconds since epoch.
        end_time: Optional end time in nanoseconds. If None, ends immediately.
        attributes: Optional dict of span attributes.
        context: Optional trace context (e.g., from extract_trace_context).
        kind: Optional SpanKind (e.g., SpanKind.SERVER).

    Returns:
        Whatever the backend's manual-span helper returns, or None when no
        backend is available.
    """
    backend_available, _, _, _, make_manual_span = _REGISTERED_TRACING_BACKENDS["otel"]
    if not backend_available():
        return None
    return make_manual_span(span_name, start_time, end_time, attributes, context, kind)
def is_tracing_available() -> bool:
    """
    Returns True if any tracing backend (OTel, Profiler, etc.) is available.
    Use this to guard expensive tracing logic in the main code.
    """
    # BUG FIX: the availability predicates must be *called*. The previous
    # code collected the function objects themselves into a list, and
    # any(list_of_functions) is True whenever at least one backend is merely
    # registered -- regardless of whether it is actually importable.
    return any(
        is_available()
        for is_available, _, _, _, _ in _REGISTERED_TRACING_BACKENDS.values()
    )

265
vllm/tracing/otel.py Normal file
View File

@@ -0,0 +1,265 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import atexit
import functools
import inspect
import os
import traceback
from collections.abc import Mapping
from contextlib import contextmanager
from typing import Any
from vllm.logger import init_logger
from vllm.tracing.utils import TRACE_HEADERS, LoadingSpanAttributes
# Module-level logger shared by the tracing helpers below.
logger = init_logger(__name__)
# OpenTelemetry is an optional dependency: attempt all imports up front and
# record the outcome in _IS_OTEL_AVAILABLE so the rest of the module can
# degrade gracefully when the packages are missing.
try:
    from opentelemetry import trace
    from opentelemetry.context.context import Context
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
        OTLPSpanExporter as OTLPGrpcExporter,
    )
    from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
        OTLPSpanExporter as OTLPHttpExporter,
    )
    from opentelemetry.propagate import inject
    from opentelemetry.sdk.environment_variables import (
        OTEL_EXPORTER_OTLP_TRACES_PROTOCOL,
    )
    from opentelemetry.sdk.resources import Resource
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor
    from opentelemetry.trace import (
        SpanKind,  # noqa: F401
        Tracer,
        set_tracer_provider,
    )
    from opentelemetry.trace.propagation.tracecontext import (
        TraceContextTextMapPropagator,
    )

    _IS_OTEL_AVAILABLE = True
    otel_import_error_traceback = None
except ImportError:
    _IS_OTEL_AVAILABLE = False
    # Keep the import traceback so callers can surface *why* OTel is unusable.
    otel_import_error_traceback = traceback.format_exc()
    # Stub out the imported names so module-level references and annotations
    # still resolve when OpenTelemetry is absent.
    trace = None  # type: ignore
    Context = Any  # type: ignore
    Tracer = Any  # type: ignore
    inject = None  # type: ignore
    Resource = None  # type: ignore
    SpanKind = Any  # type: ignore
def is_otel_available() -> bool:
    """Report whether the OpenTelemetry packages were successfully imported."""
    return bool(_IS_OTEL_AVAILABLE)
def init_otel_tracer(
    instrumenting_module_name: str,
    otlp_traces_endpoint: str,
    extra_attributes: dict[str, str] | None = None,
) -> Tracer:
    """Initializes the OpenTelemetry tracer provider.

    Args:
        instrumenting_module_name: Name passed to ``get_tracer`` and recorded
            as the ``vllm.instrumenting_module_name`` resource attribute.
        otlp_traces_endpoint: OTLP collector endpoint; also written to the
            environment so spawned worker processes inherit it.
        extra_attributes: Optional extra resource attributes.

    Returns:
        The configured tracer.

    Raises:
        ValueError: If the OpenTelemetry packages are not installed.
    """
    if not _IS_OTEL_AVAILABLE:
        raise ValueError(
            "OpenTelemetry is not available. Unable to initialize "
            "a tracer. Ensure OpenTelemetry packages are installed. "
            f"Original error:\n{otel_import_error_traceback}"
        )
    # Store the endpoint in environment so child processes can inherit it
    os.environ["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = otlp_traces_endpoint
    resource_attrs = {}
    resource_attrs["vllm.instrumenting_module_name"] = instrumenting_module_name
    resource_attrs["vllm.process_id"] = str(os.getpid())
    if extra_attributes:
        resource_attrs.update(extra_attributes)
    resource = Resource.create(resource_attrs)
    trace_provider = TracerProvider(resource=resource)
    span_exporter = get_span_exporter(otlp_traces_endpoint)
    # BatchSpanProcessor exports spans asynchronously in the background.
    trace_provider.add_span_processor(BatchSpanProcessor(span_exporter))
    # Install as the global provider so trace.get_tracer() picks it up.
    set_tracer_provider(trace_provider)
    # Flush any pending spans at interpreter exit.
    atexit.register(trace_provider.shutdown)
    tracer = trace_provider.get_tracer(instrumenting_module_name)
    return tracer
def get_span_exporter(endpoint):
    """Create an OTLP span exporter for *endpoint*.

    The wire protocol is read from the standard
    ``OTEL_EXPORTER_OTLP_TRACES_PROTOCOL`` environment variable, defaulting
    to gRPC.
    """
    protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc")
    if protocol == "grpc":
        # NOTE(review): insecure=True disables TLS -- presumably the collector
        # is local/trusted; confirm before exposing over a network.
        return OTLPGrpcExporter(endpoint=endpoint, insecure=True)
    if protocol == "http/protobuf":
        return OTLPHttpExporter(endpoint=endpoint)
    raise ValueError(f"Unsupported OTLP protocol '{protocol}' is configured")
def init_otel_worker_tracer(
    instrumenting_module_name: str,
    process_kind: str,
    process_name: str,
) -> Tracer:
    """
    Backend-specific initialization for OpenTelemetry in a worker process.

    The OTLP endpoint is propagated from the main process via the
    ``OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`` environment variable; when it is
    unset, tracing stays disabled and None is returned.
    """
    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
    if not endpoint:
        return None
    worker_attrs = {
        "vllm.process_kind": process_kind,
        "vllm.process_name": process_name,
    }
    return init_otel_tracer(instrumenting_module_name, endpoint, worker_attrs)
def extract_trace_context(headers: Mapping[str, str] | None) -> Context | None:
    """Extracts context from HTTP headers."""
    if not _IS_OTEL_AVAILABLE or not headers:
        return None
    return TraceContextTextMapPropagator().extract(headers)
def instrument_otel(func, span_name, attributes, record_exception):
    """Internal wrapper logic for sync and async functions.

    Wraps *func* so each call runs inside a new span named *span_name*
    (falling back to the function's qualname). The span is parented on the
    context returned by ``_get_smart_context`` and is propagated to child
    processes via ``propagate_trace_to_env`` for the duration of the call.
    """
    # Pre-calculate static code attributes once (these don't change)
    code_attrs = {
        LoadingSpanAttributes.CODE_FUNCTION: func.__qualname__,
        LoadingSpanAttributes.CODE_NAMESPACE: func.__module__,
        LoadingSpanAttributes.CODE_FILEPATH: func.__code__.co_filename,
        LoadingSpanAttributes.CODE_LINENO: str(func.__code__.co_firstlineno),
    }
    if attributes:
        # Caller-supplied attributes override the static code attributes.
        code_attrs.update(attributes)
    final_span_name = span_name or func.__qualname__
    module_name = func.__module__

    @functools.wraps(func)
    async def async_wrapper(*args, **kwargs):
        # Resolve the tracer at call time (not decoration time), so a
        # provider installed later is still used.
        tracer = trace.get_tracer(module_name)
        ctx = _get_smart_context()
        with (
            tracer.start_as_current_span(
                final_span_name,
                context=ctx,
                attributes=code_attrs,
                record_exception=record_exception,
            ),
            propagate_trace_to_env(),
        ):
            return await func(*args, **kwargs)

    @functools.wraps(func)
    def sync_wrapper(*args, **kwargs):
        tracer = trace.get_tracer(module_name)
        ctx = _get_smart_context()
        with (
            tracer.start_as_current_span(
                final_span_name,
                context=ctx,
                attributes=code_attrs,
                record_exception=record_exception,
            ),
            propagate_trace_to_env(),
        ):
            return func(*args, **kwargs)

    # Choose the wrapper matching the wrapped function's flavor.
    return async_wrapper if inspect.iscoroutinefunction(func) else sync_wrapper
def manual_instrument_otel(
    span_name: str,
    start_time: int,
    end_time: int | None = None,
    attributes: dict[str, Any] | None = None,
    context: Context | None = None,
    kind: Any = None,  # SpanKind, but typed as Any for when OTEL unavailable
):
    """Manually create and end a span with explicit timestamps.

    When *context* is None the parent is discovered via
    ``_get_smart_context``; when *end_time* is None the span ends
    immediately. No-op when OpenTelemetry is not installed.
    """
    if not _IS_OTEL_AVAILABLE:
        return

    # Use provided context, or fall back to smart context detection.
    parent_ctx = _get_smart_context() if context is None else context
    span_kwargs: dict[str, Any] = dict(
        name=span_name, context=parent_ctx, start_time=start_time
    )
    if kind is not None:
        span_kwargs["kind"] = kind

    span = trace.get_tracer(__name__).start_span(**span_kwargs)
    if attributes:
        span.set_attributes(attributes)
    if end_time is None:
        span.end()
    else:
        span.end(end_time=end_time)
def _get_smart_context() -> Context | None:
    """
    Determines the parent context.
    1. If a Span is already active in this process, use it.
    2. If not, extract from os.environ, handling the case-sensitivity mismatch.
    """
    current_span = trace.get_current_span()
    if current_span.get_span_context().is_valid:
        # A live in-process parent exists: returning None lets the tracer
        # fall back to the current implicit context.
        return None
    # Look for W3C trace headers injected into the environment, accepting
    # both lower- and upper-case variable names.
    carrier = {}
    if tp := os.environ.get("traceparent", os.environ.get("TRACEPARENT")):  # noqa: SIM112
        carrier["traceparent"] = tp
    if ts := os.environ.get("tracestate", os.environ.get("TRACESTATE")):  # noqa: SIM112
        carrier["tracestate"] = ts
    if not carrier:
        # Last resort: hand the whole environment to the propagator.
        carrier = dict(os.environ)
    return TraceContextTextMapPropagator().extract(carrier)
@contextmanager
def propagate_trace_to_env():
    """
    Temporarily injects the current OTel context into os.environ.
    This ensures that any subprocesses (like vLLM workers) spawned
    within this context inherit the correct traceparent.

    On exit, the affected environment variables are restored to their
    pre-injection values (or removed if they were unset).
    """
    # Without OTel there is nothing to inject; behave as a no-op.
    if not _IS_OTEL_AVAILABLE:
        yield
        return
    # Capture original state of relevant keys
    original_state = {k: os.environ.get(k) for k in TRACE_HEADERS}
    try:
        # inject() writes 'traceparent' and 'tracestate' to os.environ
        inject(os.environ)
        yield
    finally:
        # Restore original environment
        for key, original_value in original_state.items():
            if original_value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = original_value

72
vllm/tracing/utils.py Normal file
View File

@@ -0,0 +1,72 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Mapping
from vllm.logger import init_logger
from vllm.utils.func_utils import run_once
# Module-level logger for tracing utilities.
logger = init_logger(__name__)

# Standard W3C trace-context headers used for context propagation.
TRACE_HEADERS = ["traceparent", "tracestate"]
class SpanAttributes:
    """
    Standard attributes for spans.
    These are largely based on OpenTelemetry Semantic Conventions but are defined
    here as constants so they can be used by any backend or logger.
    """

    # Attribute names copied from OTel semantic conventions to avoid version conflicts
    GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
    GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"
    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
    # Custom attributes added until they are standardized
    GEN_AI_REQUEST_ID = "gen_ai.request.id"
    GEN_AI_REQUEST_N = "gen_ai.request.n"
    GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
    GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
    GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
    GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
    GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
    # Latency breakdowns of the model-execution phases
    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward"
    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute"
    GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = "gen_ai.latency.time_in_model_prefill"
    GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
    GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = "gen_ai.latency.time_in_model_inference"
class LoadingSpanAttributes:
    """Custom attributes for code-level tracing (file, line number)."""

    # Names follow OTel code.* semantic-convention keys.
    CODE_NAMESPACE = "code.namespace"
    CODE_FUNCTION = "code.function"
    CODE_FILEPATH = "code.filepath"
    CODE_LINENO = "code.lineno"
def contains_trace_headers(headers: Mapping[str, str]) -> bool:
    """Check if the provided headers dictionary contains trace context."""
    for header_name in TRACE_HEADERS:
        if header_name in headers:
            return True
    return False
def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
    """
    Extract only trace-related headers from a larger header dictionary.
    Useful for logging or passing context to a non-OTel client.
    """
    found: dict[str, str] = {}
    for name in TRACE_HEADERS:
        if name in headers:
            found[name] = headers[name]
    return found
@run_once
def log_tracing_disabled_warning() -> None:
    # NOTE(review): run_once presumably limits this warning to a single
    # emission per process -- confirm against vllm.utils.func_utils.
    logger.warning("Received a request with trace context but tracing is disabled")