Add minimal vLLM 0.16.1 build repo for BI-V150

This commit is contained in:
2026-04-18 10:56:22 +08:00
commit d69657327e
1895 changed files with 615301 additions and 0 deletions

157
vllm/tracing/__init__.py Normal file
View File

@@ -0,0 +1,157 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
from collections.abc import Callable
from typing import Any, TypeAlias
# Import the implementation details
from .otel import (
SpanKind,
extract_trace_context,
init_otel_tracer,
init_otel_worker_tracer,
instrument_otel,
is_otel_available,
manual_instrument_otel,
otel_import_error_traceback,
)
from .utils import (
SpanAttributes,
contains_trace_headers,
extract_trace_headers,
log_tracing_disabled_warning,
)
# Public API of the tracing package. Backend-agnostic entry points
# (instrument, init_tracer, ...) are defined below; the remaining names are
# re-exported from the .otel and .utils implementation modules above.
__all__ = [
    "instrument",
    "instrument_manual",
    "init_tracer",
    "maybe_init_worker_tracer",
    "is_tracing_available",
    "SpanAttributes",
    "SpanKind",
    "extract_trace_context",
    "extract_trace_headers",
    "log_tracing_disabled_warning",
    "contains_trace_headers",
    "otel_import_error_traceback",
]

# Aliases describing the five callables every tracing backend must provide.
BackendAvailableFunc: TypeAlias = Callable[[], bool]  # is the backend importable?
InstrumentFunc: TypeAlias = Callable[..., Any]  # decorator-style instrumentation
InstrumentManualFunc: TypeAlias = Callable[..., Any]  # explicit-timestamp spans
InitTracerFunc: TypeAlias = Callable[..., Any]  # main-process tracer init
InitWorkerTracerFunc: TypeAlias = Callable[..., Any]  # worker-process tracer init

# Registry of tracing backends, keyed by name. Each value is the 5-tuple
# (available, init_tracer, init_worker_tracer, instrument, manual_instrument).
# Currently only the OpenTelemetry backend is registered.
_REGISTERED_TRACING_BACKENDS: dict[
    str,
    tuple[
        BackendAvailableFunc,
        InitTracerFunc,
        InitWorkerTracerFunc,
        InstrumentFunc,
        InstrumentManualFunc,
    ],
] = {
    "otel": (
        is_otel_available,
        init_otel_tracer,
        init_otel_worker_tracer,
        instrument_otel,
        manual_instrument_otel,
    ),
}
def init_tracer(
    instrumenting_module_name: str,
    otlp_traces_endpoint: str,
    extra_attributes: dict[str, str] | None = None,
):
    """Initialize the process-wide tracer via the registered "otel" backend.

    Returns the backend's tracer when OpenTelemetry is importable; otherwise
    returns None.
    """
    backend = _REGISTERED_TRACING_BACKENDS["otel"]
    backend_available, make_tracer = backend[0], backend[1]
    if not backend_available():
        return None
    return make_tracer(
        instrumenting_module_name, otlp_traces_endpoint, extra_attributes
    )
def maybe_init_worker_tracer(
    instrumenting_module_name: str,
    process_kind: str,
    process_name: str,
):
    """Initialize a tracer in a worker process, if a backend is available.

    Returns the worker tracer from the "otel" backend, or None when the
    backend is not importable.
    """
    backend = _REGISTERED_TRACING_BACKENDS["otel"]
    backend_available, make_worker_tracer = backend[0], backend[2]
    if not backend_available():
        return None
    return make_worker_tracer(instrumenting_module_name, process_kind, process_name)
def instrument(
obj: Callable | None = None,
*,
span_name: str = "",
attributes: dict[str, str] | None = None,
record_exception: bool = True,
):
"""
Generic decorator to instrument functions.
"""
if obj is None:
return functools.partial(
instrument,
span_name=span_name,
attributes=attributes,
record_exception=record_exception,
)
# Dispatch to OTel (and potentially others later)
is_available, _, _, otel_instrument, _ = _REGISTERED_TRACING_BACKENDS["otel"]
if is_available():
return otel_instrument(
func=obj,
span_name=span_name,
attributes=attributes,
record_exception=record_exception,
)
else:
return obj
def instrument_manual(
    span_name: str,
    start_time: int,
    end_time: int | None = None,
    attributes: dict[str, Any] | None = None,
    context: Any = None,
    kind: Any = None,
):
    """Manually create a span with explicit timestamps.

    Args:
        span_name: Name of the span to create.
        start_time: Start time in nanoseconds since epoch.
        end_time: Optional end time in nanoseconds. If None, ends immediately.
        attributes: Optional dict of span attributes.
        context: Optional trace context (e.g., from extract_trace_context).
        kind: Optional SpanKind (e.g., SpanKind.SERVER).

    Returns:
        Whatever the backend's manual-span helper returns, or None when no
        backend is available.
    """
    backend_available, _, _, _, make_manual_span = _REGISTERED_TRACING_BACKENDS["otel"]
    if not backend_available():
        return None
    return make_manual_span(span_name, start_time, end_time, attributes, context, kind)
def is_tracing_available() -> bool:
    """
    Returns True if any tracing backend (OTel, Profiler, etc.) is available.
    Use this to guard expensive tracing logic in the main code.
    """
    # BUG FIX: the availability predicates must be *called*. The previous
    # code collected the function objects themselves into a list, and
    # any(list_of_functions) is True whenever at least one backend is merely
    # registered -- regardless of whether it is actually importable.
    return any(
        is_available()
        for is_available, _, _, _, _ in _REGISTERED_TRACING_BACKENDS.values()
    )

265
vllm/tracing/otel.py Normal file
View File

@@ -0,0 +1,265 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import atexit
import functools
import inspect
import os
import traceback
from collections.abc import Mapping
from contextlib import contextmanager
from typing import Any
from vllm.logger import init_logger
from vllm.tracing.utils import TRACE_HEADERS, LoadingSpanAttributes
# Module-level logger shared by the tracing helpers below.
logger = init_logger(__name__)
# OpenTelemetry is an optional dependency: attempt all imports up front and
# record the outcome in _IS_OTEL_AVAILABLE so the rest of the module can
# degrade gracefully when the packages are missing.
try:
    from opentelemetry import trace
    from opentelemetry.context.context import Context
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
        OTLPSpanExporter as OTLPGrpcExporter,
    )
    from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
        OTLPSpanExporter as OTLPHttpExporter,
    )
    from opentelemetry.propagate import inject
    from opentelemetry.sdk.environment_variables import (
        OTEL_EXPORTER_OTLP_TRACES_PROTOCOL,
    )
    from opentelemetry.sdk.resources import Resource
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor
    from opentelemetry.trace import (
        SpanKind,  # noqa: F401
        Tracer,
        set_tracer_provider,
    )
    from opentelemetry.trace.propagation.tracecontext import (
        TraceContextTextMapPropagator,
    )

    _IS_OTEL_AVAILABLE = True
    otel_import_error_traceback = None
except ImportError:
    _IS_OTEL_AVAILABLE = False
    # Keep the import traceback so callers can surface *why* OTel is unusable.
    otel_import_error_traceback = traceback.format_exc()
    # Stub out the imported names so module-level references and annotations
    # still resolve when OpenTelemetry is absent.
    trace = None  # type: ignore
    Context = Any  # type: ignore
    Tracer = Any  # type: ignore
    inject = None  # type: ignore
    Resource = None  # type: ignore
    SpanKind = Any  # type: ignore
def is_otel_available() -> bool:
    """Report whether the OpenTelemetry packages were successfully imported."""
    return bool(_IS_OTEL_AVAILABLE)
def init_otel_tracer(
    instrumenting_module_name: str,
    otlp_traces_endpoint: str,
    extra_attributes: dict[str, str] | None = None,
) -> Tracer:
    """Initializes the OpenTelemetry tracer provider.

    Args:
        instrumenting_module_name: Name passed to ``get_tracer`` and recorded
            as the ``vllm.instrumenting_module_name`` resource attribute.
        otlp_traces_endpoint: OTLP collector endpoint; also written to the
            environment so spawned worker processes inherit it.
        extra_attributes: Optional extra resource attributes.

    Returns:
        The configured tracer.

    Raises:
        ValueError: If the OpenTelemetry packages are not installed.
    """
    if not _IS_OTEL_AVAILABLE:
        raise ValueError(
            "OpenTelemetry is not available. Unable to initialize "
            "a tracer. Ensure OpenTelemetry packages are installed. "
            f"Original error:\n{otel_import_error_traceback}"
        )
    # Store the endpoint in environment so child processes can inherit it
    os.environ["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = otlp_traces_endpoint
    resource_attrs = {}
    resource_attrs["vllm.instrumenting_module_name"] = instrumenting_module_name
    resource_attrs["vllm.process_id"] = str(os.getpid())
    if extra_attributes:
        resource_attrs.update(extra_attributes)
    resource = Resource.create(resource_attrs)
    trace_provider = TracerProvider(resource=resource)
    span_exporter = get_span_exporter(otlp_traces_endpoint)
    # BatchSpanProcessor exports spans asynchronously in the background.
    trace_provider.add_span_processor(BatchSpanProcessor(span_exporter))
    # Install as the global provider so trace.get_tracer() picks it up.
    set_tracer_provider(trace_provider)
    # Flush any pending spans at interpreter exit.
    atexit.register(trace_provider.shutdown)
    tracer = trace_provider.get_tracer(instrumenting_module_name)
    return tracer
def get_span_exporter(endpoint):
    """Create an OTLP span exporter for *endpoint*.

    The wire protocol is read from the standard
    ``OTEL_EXPORTER_OTLP_TRACES_PROTOCOL`` environment variable, defaulting
    to gRPC.
    """
    protocol = os.environ.get(OTEL_EXPORTER_OTLP_TRACES_PROTOCOL, "grpc")
    if protocol == "grpc":
        # NOTE(review): insecure=True disables TLS -- presumably the collector
        # is local/trusted; confirm before exposing over a network.
        return OTLPGrpcExporter(endpoint=endpoint, insecure=True)
    if protocol == "http/protobuf":
        return OTLPHttpExporter(endpoint=endpoint)
    raise ValueError(f"Unsupported OTLP protocol '{protocol}' is configured")
def init_otel_worker_tracer(
    instrumenting_module_name: str,
    process_kind: str,
    process_name: str,
) -> Tracer:
    """
    Backend-specific initialization for OpenTelemetry in a worker process.

    The OTLP endpoint is propagated from the main process via the
    ``OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`` environment variable; when it is
    unset, tracing stays disabled and None is returned.
    """
    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
    if not endpoint:
        return None
    worker_attrs = {
        "vllm.process_kind": process_kind,
        "vllm.process_name": process_name,
    }
    return init_otel_tracer(instrumenting_module_name, endpoint, worker_attrs)
def extract_trace_context(headers: Mapping[str, str] | None) -> Context | None:
    """Extracts context from HTTP headers."""
    if not _IS_OTEL_AVAILABLE or not headers:
        return None
    return TraceContextTextMapPropagator().extract(headers)
def instrument_otel(func, span_name, attributes, record_exception):
    """Internal wrapper logic for sync and async functions.

    Wraps *func* so each call runs inside a new span named *span_name*
    (falling back to the function's qualname). The span is parented on the
    context returned by ``_get_smart_context`` and is propagated to child
    processes via ``propagate_trace_to_env`` for the duration of the call.
    """
    # Pre-calculate static code attributes once (these don't change)
    code_attrs = {
        LoadingSpanAttributes.CODE_FUNCTION: func.__qualname__,
        LoadingSpanAttributes.CODE_NAMESPACE: func.__module__,
        LoadingSpanAttributes.CODE_FILEPATH: func.__code__.co_filename,
        LoadingSpanAttributes.CODE_LINENO: str(func.__code__.co_firstlineno),
    }
    if attributes:
        # Caller-supplied attributes override the static code attributes.
        code_attrs.update(attributes)
    final_span_name = span_name or func.__qualname__
    module_name = func.__module__

    @functools.wraps(func)
    async def async_wrapper(*args, **kwargs):
        # Resolve the tracer at call time (not decoration time), so a
        # provider installed later is still used.
        tracer = trace.get_tracer(module_name)
        ctx = _get_smart_context()
        with (
            tracer.start_as_current_span(
                final_span_name,
                context=ctx,
                attributes=code_attrs,
                record_exception=record_exception,
            ),
            propagate_trace_to_env(),
        ):
            return await func(*args, **kwargs)

    @functools.wraps(func)
    def sync_wrapper(*args, **kwargs):
        tracer = trace.get_tracer(module_name)
        ctx = _get_smart_context()
        with (
            tracer.start_as_current_span(
                final_span_name,
                context=ctx,
                attributes=code_attrs,
                record_exception=record_exception,
            ),
            propagate_trace_to_env(),
        ):
            return func(*args, **kwargs)

    # Choose the wrapper matching the wrapped function's flavor.
    return async_wrapper if inspect.iscoroutinefunction(func) else sync_wrapper
def manual_instrument_otel(
    span_name: str,
    start_time: int,
    end_time: int | None = None,
    attributes: dict[str, Any] | None = None,
    context: Context | None = None,
    kind: Any = None,  # SpanKind, but typed as Any for when OTEL unavailable
):
    """Manually create and end a span with explicit timestamps.

    When *context* is None the parent is discovered via
    ``_get_smart_context``; when *end_time* is None the span ends
    immediately. No-op when OpenTelemetry is not installed.
    """
    if not _IS_OTEL_AVAILABLE:
        return

    # Use provided context, or fall back to smart context detection.
    parent_ctx = _get_smart_context() if context is None else context
    span_kwargs: dict[str, Any] = dict(
        name=span_name, context=parent_ctx, start_time=start_time
    )
    if kind is not None:
        span_kwargs["kind"] = kind

    span = trace.get_tracer(__name__).start_span(**span_kwargs)
    if attributes:
        span.set_attributes(attributes)
    if end_time is None:
        span.end()
    else:
        span.end(end_time=end_time)
def _get_smart_context() -> Context | None:
    """
    Determines the parent context.
    1. If a Span is already active in this process, use it.
    2. If not, extract from os.environ, handling the case-sensitivity mismatch.
    """
    current_span = trace.get_current_span()
    if current_span.get_span_context().is_valid:
        # A live in-process parent exists: returning None lets the tracer
        # fall back to the current implicit context.
        return None
    # Look for W3C trace headers injected into the environment, accepting
    # both lower- and upper-case variable names.
    carrier = {}
    if tp := os.environ.get("traceparent", os.environ.get("TRACEPARENT")):  # noqa: SIM112
        carrier["traceparent"] = tp
    if ts := os.environ.get("tracestate", os.environ.get("TRACESTATE")):  # noqa: SIM112
        carrier["tracestate"] = ts
    if not carrier:
        # Last resort: hand the whole environment to the propagator.
        carrier = dict(os.environ)
    return TraceContextTextMapPropagator().extract(carrier)
@contextmanager
def propagate_trace_to_env():
    """
    Temporarily injects the current OTel context into os.environ.
    This ensures that any subprocesses (like vLLM workers) spawned
    within this context inherit the correct traceparent.

    On exit, the affected environment variables are restored to their
    pre-injection values (or removed if they were unset).
    """
    # Without OTel there is nothing to inject; behave as a no-op.
    if not _IS_OTEL_AVAILABLE:
        yield
        return
    # Capture original state of relevant keys
    original_state = {k: os.environ.get(k) for k in TRACE_HEADERS}
    try:
        # inject() writes 'traceparent' and 'tracestate' to os.environ
        inject(os.environ)
        yield
    finally:
        # Restore original environment
        for key, original_value in original_state.items():
            if original_value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = original_value

72
vllm/tracing/utils.py Normal file
View File

@@ -0,0 +1,72 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Mapping
from vllm.logger import init_logger
from vllm.utils.func_utils import run_once
# Module-level logger for tracing utilities.
logger = init_logger(__name__)

# Standard W3C trace-context headers used for context propagation.
TRACE_HEADERS = ["traceparent", "tracestate"]
class SpanAttributes:
    """
    Standard attributes for spans.
    These are largely based on OpenTelemetry Semantic Conventions but are defined
    here as constants so they can be used by any backend or logger.
    """

    # Attribute names copied from OTel semantic conventions to avoid version conflicts
    GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
    GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"
    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
    # Custom attributes added until they are standardized
    GEN_AI_REQUEST_ID = "gen_ai.request.id"
    GEN_AI_REQUEST_N = "gen_ai.request.n"
    GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
    GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
    GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
    GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
    GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
    # Latency breakdowns of the model-execution phases
    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward"
    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute"
    GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = "gen_ai.latency.time_in_model_prefill"
    GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
    GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = "gen_ai.latency.time_in_model_inference"
class LoadingSpanAttributes:
    """Custom attributes for code-level tracing (file, line number)."""

    # Names follow OTel code.* semantic-convention keys.
    CODE_NAMESPACE = "code.namespace"
    CODE_FUNCTION = "code.function"
    CODE_FILEPATH = "code.filepath"
    CODE_LINENO = "code.lineno"
def contains_trace_headers(headers: Mapping[str, str]) -> bool:
    """Check if the provided headers dictionary contains trace context."""
    for header_name in TRACE_HEADERS:
        if header_name in headers:
            return True
    return False
def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
    """
    Extract only trace-related headers from a larger header dictionary.
    Useful for logging or passing context to a non-OTel client.
    """
    found: dict[str, str] = {}
    for name in TRACE_HEADERS:
        if name in headers:
            found[name] = headers[name]
    return found
@run_once
def log_tracing_disabled_warning() -> None:
    # NOTE(review): run_once presumably limits this warning to a single
    # emission per process -- confirm against vllm.utils.func_utils.
    logger.warning("Received a request with trace context but tracing is disabled")