Sync from v0.13
This commit is contained in:
82
vllm/v1/metrics/prometheus.py
Normal file
82
vllm/v1/metrics/prometheus.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Global temporary directory for prometheus multiprocessing
|
||||
_prometheus_multiproc_dir: tempfile.TemporaryDirectory | None = None
|
||||
|
||||
|
||||
def setup_multiprocess_prometheus():
|
||||
"""Set up prometheus multiprocessing directory if not already configured."""
|
||||
global _prometheus_multiproc_dir
|
||||
|
||||
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
|
||||
# Make TemporaryDirectory for prometheus multiprocessing
|
||||
# Note: global TemporaryDirectory will be automatically
|
||||
# cleaned up upon exit.
|
||||
_prometheus_multiproc_dir = tempfile.TemporaryDirectory()
|
||||
os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name
|
||||
logger.debug(
|
||||
"Created PROMETHEUS_MULTIPROC_DIR at %s", _prometheus_multiproc_dir.name
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"Found PROMETHEUS_MULTIPROC_DIR was set by user. "
|
||||
"This directory must be wiped between vLLM runs or "
|
||||
"you will find inaccurate metrics. Unset the variable "
|
||||
"and vLLM will properly handle cleanup."
|
||||
)
|
||||
|
||||
|
||||
def get_prometheus_registry() -> CollectorRegistry:
|
||||
"""Get the appropriate prometheus registry based on multiprocessing
|
||||
configuration.
|
||||
|
||||
Returns:
|
||||
Registry: A prometheus registry
|
||||
"""
|
||||
if os.getenv("PROMETHEUS_MULTIPROC_DIR") is not None:
|
||||
logger.debug("Using multiprocess registry for prometheus metrics")
|
||||
registry = CollectorRegistry()
|
||||
multiprocess.MultiProcessCollector(registry)
|
||||
return registry
|
||||
|
||||
return REGISTRY
|
||||
|
||||
|
||||
def unregister_vllm_metrics():
|
||||
"""Unregister any existing vLLM collectors from the prometheus registry.
|
||||
|
||||
This is useful for testing and CI/CD where metrics may be registered
|
||||
multiple times across test runs.
|
||||
|
||||
Also, in case of multiprocess, we need to unregister the metrics from the
|
||||
global registry.
|
||||
"""
|
||||
registry = REGISTRY
|
||||
# Unregister any existing vLLM collectors
|
||||
for collector in list(registry._collector_to_names):
|
||||
if hasattr(collector, "_name") and "vllm" in collector._name:
|
||||
registry.unregister(collector)
|
||||
|
||||
|
||||
def shutdown_prometheus():
|
||||
"""Shutdown prometheus metrics."""
|
||||
|
||||
path = _prometheus_multiproc_dir
|
||||
if path is None:
|
||||
return
|
||||
try:
|
||||
pid = os.getpid()
|
||||
multiprocess.mark_process_dead(pid, path)
|
||||
logger.debug("Marked Prometheus metrics for process %d as dead", pid)
|
||||
except Exception as e:
|
||||
logger.error("Error during metrics cleanup: %s", str(e))
|
||||
Reference in New Issue
Block a user