83 lines
2.7 KiB
Python
83 lines
2.7 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
import os
|
|
import tempfile
|
|
|
|
from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
|
|
|
|
from vllm.logger import init_logger
|
|
|
|
logger = init_logger(__name__)
|
|
|
|
# Global temporary directory for prometheus multiprocessing
|
|
_prometheus_multiproc_dir: tempfile.TemporaryDirectory | None = None
|
|
|
|
|
|
def setup_multiprocess_prometheus():
|
|
"""Set up prometheus multiprocessing directory if not already configured."""
|
|
global _prometheus_multiproc_dir
|
|
|
|
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
|
|
# Make TemporaryDirectory for prometheus multiprocessing
|
|
# Note: global TemporaryDirectory will be automatically
|
|
# cleaned up upon exit.
|
|
_prometheus_multiproc_dir = tempfile.TemporaryDirectory()
|
|
os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name
|
|
logger.debug(
|
|
"Created PROMETHEUS_MULTIPROC_DIR at %s", _prometheus_multiproc_dir.name
|
|
)
|
|
else:
|
|
logger.warning(
|
|
"Found PROMETHEUS_MULTIPROC_DIR was set by user. "
|
|
"This directory must be wiped between vLLM runs or "
|
|
"you will find inaccurate metrics. Unset the variable "
|
|
"and vLLM will properly handle cleanup."
|
|
)
|
|
|
|
|
|
def get_prometheus_registry() -> CollectorRegistry:
|
|
"""Get the appropriate prometheus registry based on multiprocessing
|
|
configuration.
|
|
|
|
Returns:
|
|
Registry: A prometheus registry
|
|
"""
|
|
if os.getenv("PROMETHEUS_MULTIPROC_DIR") is not None:
|
|
logger.debug("Using multiprocess registry for prometheus metrics")
|
|
registry = CollectorRegistry()
|
|
multiprocess.MultiProcessCollector(registry)
|
|
return registry
|
|
|
|
return REGISTRY
|
|
|
|
|
|
def unregister_vllm_metrics():
|
|
"""Unregister any existing vLLM collectors from the prometheus registry.
|
|
|
|
This is useful for testing and CI/CD where metrics may be registered
|
|
multiple times across test runs.
|
|
|
|
Also, in case of multiprocess, we need to unregister the metrics from the
|
|
global registry.
|
|
"""
|
|
registry = REGISTRY
|
|
# Unregister any existing vLLM collectors
|
|
for collector in list(registry._collector_to_names):
|
|
if hasattr(collector, "_name") and "vllm" in collector._name:
|
|
registry.unregister(collector)
|
|
|
|
|
|
def shutdown_prometheus():
|
|
"""Shutdown prometheus metrics."""
|
|
|
|
path = _prometheus_multiproc_dir
|
|
if path is None:
|
|
return
|
|
try:
|
|
pid = os.getpid()
|
|
multiprocess.mark_process_dead(pid, path)
|
|
logger.debug("Marked Prometheus metrics for process %d as dead", pid)
|
|
except Exception as e:
|
|
logger.error("Error during metrics cleanup: %s", str(e))
|