73 lines
2.7 KiB
Python
73 lines
2.7 KiB
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
||
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||
|
|
|
||
|
|
from collections.abc import Mapping
|
||
|
|
|
||
|
|
from vllm.logger import init_logger
|
||
|
|
from vllm.utils.func_utils import run_once
|
||
|
|
|
||
|
|
logger = init_logger(__name__)
|
||
|
|
|
||
|
|
# Standard W3C headers used for context propagation
|
||
|
|
TRACE_HEADERS = ["traceparent", "tracestate"]
|
||
|
|
|
||
|
|
|
||
|
|
class SpanAttributes:
|
||
|
|
"""
|
||
|
|
Standard attributes for spans.
|
||
|
|
|
||
|
|
These are largely based on OpenTelemetry Semantic Conventions but are defined
|
||
|
|
here as constants so they can be used by any backend or logger.
|
||
|
|
"""
|
||
|
|
|
||
|
|
# Attribute names copied from OTel semantic conventions to avoid version conflicts
|
||
|
|
GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
|
||
|
|
GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
|
||
|
|
GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
|
||
|
|
GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"
|
||
|
|
GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
|
||
|
|
GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
|
||
|
|
|
||
|
|
# Custom attributes added until they are standardized
|
||
|
|
GEN_AI_REQUEST_ID = "gen_ai.request.id"
|
||
|
|
GEN_AI_REQUEST_N = "gen_ai.request.n"
|
||
|
|
GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
|
||
|
|
GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
|
||
|
|
GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
|
||
|
|
GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
|
||
|
|
GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
|
||
|
|
|
||
|
|
# Latency breakdowns
|
||
|
|
GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward"
|
||
|
|
GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute"
|
||
|
|
GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = "gen_ai.latency.time_in_model_prefill"
|
||
|
|
GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
|
||
|
|
GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = "gen_ai.latency.time_in_model_inference"
|
||
|
|
|
||
|
|
|
||
|
|
class LoadingSpanAttributes:
|
||
|
|
"""Custom attributes for code-level tracing (file, line number)."""
|
||
|
|
|
||
|
|
CODE_NAMESPACE = "code.namespace"
|
||
|
|
CODE_FUNCTION = "code.function"
|
||
|
|
CODE_FILEPATH = "code.filepath"
|
||
|
|
CODE_LINENO = "code.lineno"
|
||
|
|
|
||
|
|
|
||
|
|
def contains_trace_headers(headers: Mapping[str, str]) -> bool:
|
||
|
|
"""Check if the provided headers dictionary contains trace context."""
|
||
|
|
return any(h in headers for h in TRACE_HEADERS)
|
||
|
|
|
||
|
|
|
||
|
|
def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
|
||
|
|
"""
|
||
|
|
Extract only trace-related headers from a larger header dictionary.
|
||
|
|
Useful for logging or passing context to a non-OTel client.
|
||
|
|
"""
|
||
|
|
return {h: headers[h] for h in TRACE_HEADERS if h in headers}
|
||
|
|
|
||
|
|
|
||
|
|
@run_once
|
||
|
|
def log_tracing_disabled_warning() -> None:
|
||
|
|
logger.warning("Received a request with trace context but tracing is disabled")
|