Sync from v0.13
This commit is contained in:
@@ -1,3 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
@@ -7,7 +10,7 @@ import time
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from threading import Thread
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
import cpuinfo
|
||||
@@ -16,14 +19,36 @@ import requests
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.platform_utils import cuda_get_device_properties
|
||||
from vllm.utils.torch_utils import cuda_device_count_stateless
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
logger = init_logger(__name__)

# Usage-stats files are placed directly under the configured vLLM config root.
_config_home = envs.VLLM_CONFIG_ROOT
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json")
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")
# Starts out undecided (None); is_usage_stats_enabled() stores its result here.
_USAGE_STATS_ENABLED = None
_USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER

# Extra key/value pairs merged into every periodic usage heartbeat payload.
_GLOBAL_RUNTIME_DATA: dict[str, str | int | bool] = {}

# Names of `vllm.envs` attributes whose current values are serialized into
# the one-off usage report.
_USAGE_ENV_VARS_TO_COLLECT = [
    "VLLM_USE_MODELSCOPE",
    "VLLM_ATTENTION_BACKEND",
    "VLLM_USE_FLASHINFER_SAMPLER",
    "VLLM_PP_LAYER_PARTITION",
    "VLLM_USE_TRITON_AWQ",
    "VLLM_ENABLE_V1_MULTIPROCESSING",
]
|
||||
|
||||
|
||||
def set_runtime_usage_data(key: str, value: str | int | bool) -> None:
    """Set global usage data that will be sent with every usage heartbeat.

    Args:
        key: Name under which the value is stored.
        value: Value to record; a later call with the same key overwrites it.
    """
    _GLOBAL_RUNTIME_DATA[key] = value
|
||||
|
||||
|
||||
def is_usage_stats_enabled():
|
||||
"""Determine whether or not we can send usage stats to the server.
|
||||
@@ -42,8 +67,7 @@ def is_usage_stats_enabled():
|
||||
no_usage_stats = envs.VLLM_NO_USAGE_STATS
|
||||
do_not_track_file = os.path.exists(_USAGE_STATS_DO_NOT_TRACK_PATH)
|
||||
|
||||
_USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats
|
||||
or do_not_track_file)
|
||||
_USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats or do_not_track_file)
|
||||
return _USAGE_STATS_ENABLED
|
||||
|
||||
|
||||
@@ -54,9 +78,11 @@ def _get_current_timestamp_ns() -> int:
|
||||
def _detect_cloud_provider() -> str:
|
||||
# Try detecting through vendor file
|
||||
vendor_files = [
|
||||
"/sys/class/dmi/id/product_version", "/sys/class/dmi/id/bios_vendor",
|
||||
"/sys/class/dmi/id/product_version",
|
||||
"/sys/class/dmi/id/bios_vendor",
|
||||
"/sys/class/dmi/id/product_name",
|
||||
"/sys/class/dmi/id/chassis_asset_tag", "/sys/class/dmi/id/sys_vendor"
|
||||
"/sys/class/dmi/id/chassis_asset_tag",
|
||||
"/sys/class/dmi/id/sys_vendor",
|
||||
]
|
||||
# Mapping of identifiable strings to cloud providers
|
||||
cloud_identifiers = {
|
||||
@@ -90,6 +116,7 @@ class UsageContext(str, Enum):
|
||||
LLM_CLASS = "LLM_CLASS"
|
||||
API_SERVER = "API_SERVER"
|
||||
OPENAI_API_SERVER = "OPENAI_API_SERVER"
|
||||
OPENAI_BATCH_RUNNER = "OPENAI_BATCH_RUNNER"
|
||||
ENGINE_CONTEXT = "ENGINE_CONTEXT"
|
||||
|
||||
|
||||
@@ -103,50 +130,97 @@ class UsageMessage:
|
||||
self.uuid = str(uuid4())
|
||||
|
||||
# Environment Information
|
||||
self.provider: Optional[str] = None
|
||||
self.num_cpu: Optional[int] = None
|
||||
self.cpu_type: Optional[str] = None
|
||||
self.cpu_family_model_stepping: Optional[str] = None
|
||||
self.total_memory: Optional[int] = None
|
||||
self.architecture: Optional[str] = None
|
||||
self.platform: Optional[str] = None
|
||||
self.gpu_count: Optional[int] = None
|
||||
self.gpu_type: Optional[str] = None
|
||||
self.gpu_memory_per_device: Optional[int] = None
|
||||
self.provider: str | None = None
|
||||
self.num_cpu: int | None = None
|
||||
self.cpu_type: str | None = None
|
||||
self.cpu_family_model_stepping: str | None = None
|
||||
self.total_memory: int | None = None
|
||||
self.architecture: str | None = None
|
||||
self.platform: str | None = None
|
||||
self.cuda_runtime: str | None = None
|
||||
self.gpu_count: int | None = None
|
||||
self.gpu_type: str | None = None
|
||||
self.gpu_memory_per_device: int | None = None
|
||||
self.env_var_json: str | None = None
|
||||
|
||||
# vLLM Information
|
||||
self.model_architecture: Optional[str] = None
|
||||
self.vllm_version: Optional[str] = None
|
||||
self.context: Optional[str] = None
|
||||
self.model_architecture: str | None = None
|
||||
self.vllm_version: str | None = None
|
||||
self.context: str | None = None
|
||||
|
||||
# Metadata
|
||||
self.log_time: Optional[int] = None
|
||||
self.source: Optional[str] = None
|
||||
self.log_time: int | None = None
|
||||
self.source: str | None = None
|
||||
|
||||
def report_usage(
    self,
    model_architecture: str,
    usage_context: UsageContext,
    extra_kvs: dict[str, Any] | None = None,
) -> None:
    """Kick off usage reporting on a background daemon thread.

    The call returns as soon as the thread has been started; the actual
    reporting is done by ``_report_usage_worker``.

    Args:
        model_architecture: Passed through unchanged to the worker.
        usage_context: Passed through unchanged to the worker.
        extra_kvs: Optional additional key/value pairs for the worker. A
            missing or empty mapping is replaced by a fresh empty dict.
    """
    worker_thread = Thread(
        target=self._report_usage_worker,
        args=(model_architecture, usage_context, extra_kvs or {}),
        # Daemon thread: it must never keep the interpreter alive on exit.
        daemon=True,
    )
    worker_thread.start()
|
||||
|
||||
def _report_usage_worker(self, model_architecture: str,
|
||||
usage_context: UsageContext,
|
||||
extra_kvs: Dict[str, Any]) -> None:
|
||||
def _report_usage_worker(
|
||||
self,
|
||||
model_architecture: str,
|
||||
usage_context: UsageContext,
|
||||
extra_kvs: dict[str, Any],
|
||||
) -> None:
|
||||
self._report_usage_once(model_architecture, usage_context, extra_kvs)
|
||||
self._report_continous_usage()
|
||||
self._report_continuous_usage()
|
||||
|
||||
def _report_usage_once(self, model_architecture: str,
|
||||
usage_context: UsageContext,
|
||||
extra_kvs: Dict[str, Any]) -> None:
|
||||
def _report_tpu_inference_usage(self) -> bool:
|
||||
try:
|
||||
from tpu_inference import tpu_info, utils
|
||||
|
||||
self.gpu_count = tpu_info.get_num_chips()
|
||||
self.gpu_type = tpu_info.get_tpu_type()
|
||||
self.gpu_memory_per_device = utils.get_device_hbm_limit()
|
||||
self.cuda_runtime = "tpu_inference"
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _report_torch_xla_usage(self) -> bool:
|
||||
try:
|
||||
import torch_xla
|
||||
|
||||
self.gpu_count = torch_xla.runtime.world_size()
|
||||
self.gpu_type = torch_xla.tpu.get_tpu_type()
|
||||
self.gpu_memory_per_device = torch_xla.core.xla_model.get_memory_info()[
|
||||
"bytes_limit"
|
||||
]
|
||||
self.cuda_runtime = "torch_xla"
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _report_usage_once(
|
||||
self,
|
||||
model_architecture: str,
|
||||
usage_context: UsageContext,
|
||||
extra_kvs: dict[str, Any],
|
||||
) -> None:
|
||||
# Platform information
|
||||
if torch.cuda.is_available():
|
||||
device_property = torch.cuda.get_device_properties(0)
|
||||
self.gpu_count = torch.cuda.device_count()
|
||||
self.gpu_type = device_property.name
|
||||
self.gpu_memory_per_device = device_property.total_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_cuda_alike():
|
||||
self.gpu_count = cuda_device_count_stateless()
|
||||
self.gpu_type, self.gpu_memory_per_device = cuda_get_device_properties(
|
||||
0, ("name", "total_memory")
|
||||
)
|
||||
if current_platform.is_cuda():
|
||||
self.cuda_runtime = torch.version.cuda
|
||||
if current_platform.is_tpu(): # noqa: SIM102
|
||||
if (not self._report_tpu_inference_usage()) and (
|
||||
not self._report_torch_xla_usage()
|
||||
):
|
||||
logger.exception("Failed to collect TPU information")
|
||||
self.provider = _detect_cloud_provider()
|
||||
self.architecture = platform.machine()
|
||||
self.platform = platform.platform()
|
||||
@@ -155,18 +229,24 @@ class UsageMessage:
|
||||
info = cpuinfo.get_cpu_info()
|
||||
self.num_cpu = info.get("count", None)
|
||||
self.cpu_type = info.get("brand_raw", "")
|
||||
self.cpu_family_model_stepping = ",".join([
|
||||
str(info.get("family", "")),
|
||||
str(info.get("model", "")),
|
||||
str(info.get("stepping", ""))
|
||||
])
|
||||
self.cpu_family_model_stepping = ",".join(
|
||||
[
|
||||
str(info.get("family", "")),
|
||||
str(info.get("model", "")),
|
||||
str(info.get("stepping", "")),
|
||||
]
|
||||
)
|
||||
|
||||
# vLLM information
|
||||
import vllm # delayed import to prevent circular import
|
||||
self.context = usage_context.value
|
||||
self.vllm_version = vllm.__version__
|
||||
self.vllm_version = VLLM_VERSION
|
||||
self.model_architecture = model_architecture
|
||||
|
||||
# Environment variables
|
||||
self.env_var_json = json.dumps(
|
||||
{env_var: getattr(envs, env_var) for env_var in _USAGE_ENV_VARS_TO_COLLECT}
|
||||
)
|
||||
|
||||
# Metadata
|
||||
self.log_time = _get_current_timestamp_ns()
|
||||
self.source = envs.VLLM_USAGE_SOURCE
|
||||
@@ -178,7 +258,7 @@ class UsageMessage:
|
||||
self._write_to_file(data)
|
||||
self._send_to_server(data)
|
||||
|
||||
def _report_continous_usage(self):
|
||||
def _report_continuous_usage(self):
|
||||
"""Report usage every 10 minutes.
|
||||
|
||||
This helps us to collect more data points for uptime of vLLM usages.
|
||||
@@ -186,19 +266,24 @@ class UsageMessage:
|
||||
"""
|
||||
while True:
|
||||
time.sleep(600)
|
||||
data = {"uuid": self.uuid, "log_time": _get_current_timestamp_ns()}
|
||||
data = {
|
||||
"uuid": self.uuid,
|
||||
"log_time": _get_current_timestamp_ns(),
|
||||
}
|
||||
data.update(_GLOBAL_RUNTIME_DATA)
|
||||
|
||||
self._write_to_file(data)
|
||||
self._send_to_server(data)
|
||||
|
||||
def _send_to_server(self, data: dict[str, Any]) -> None:
    """POST a usage payload to the usage-stats server, best-effort.

    Args:
        data: Payload sent as the JSON body of the request.

    A ``requests.exceptions.RequestException`` raised while sending is
    swallowed so that reporting never disturbs the caller; it is only noted
    at debug level.
    """
    try:
        global_http_client = global_http_connection.get_sync_client()
        global_http_client.post(_USAGE_STATS_SERVER, json=data)
    except requests.exceptions.RequestException:
        # silently ignore unless we are using debug log
        # Use the module logger rather than the root `logging` functions so
        # the message follows this module's logging configuration.
        logger.debug("Failed to send usage data to server")
|
||||
|
||||
def _write_to_file(self, data):
|
||||
def _write_to_file(self, data: dict[str, Any]) -> None:
|
||||
os.makedirs(os.path.dirname(_USAGE_STATS_JSON_PATH), exist_ok=True)
|
||||
Path(_USAGE_STATS_JSON_PATH).touch(exist_ok=True)
|
||||
with open(_USAGE_STATS_JSON_PATH, "a") as f:
|
||||
|
||||
Reference in New Issue
Block a user