Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions


@@ -1,3 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json
import logging
@@ -7,7 +10,7 @@ import time
from enum import Enum
from pathlib import Path
from threading import Thread
from typing import Any
from uuid import uuid4
import cpuinfo
@@ -16,14 +19,36 @@ import requests
import torch
import vllm.envs as envs
from vllm.connections import global_http_connection
from vllm.logger import init_logger
from vllm.utils.platform_utils import cuda_get_device_properties
from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
_config_home = envs.VLLM_CONFIG_ROOT
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
"vllm/do_not_track")
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json")
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")
_USAGE_STATS_ENABLED = None
_USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
_GLOBAL_RUNTIME_DATA = dict[str, str | int | bool]()
_USAGE_ENV_VARS_TO_COLLECT = [
"VLLM_USE_MODELSCOPE",
"VLLM_ATTENTION_BACKEND",
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_PP_LAYER_PARTITION",
"VLLM_USE_TRITON_AWQ",
"VLLM_ENABLE_V1_MULTIPROCESSING",
]
def set_runtime_usage_data(key: str, value: str | int | bool) -> None:
"""Set global usage data that will be sent with every usage heartbeat."""
_GLOBAL_RUNTIME_DATA[key] = value
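# Hedged usage sketch (assumption, not part of this commit): engine components
# can tag every subsequent heartbeat with extra runtime facts through the
# helper above. The key names and values below are hypothetical examples.
def _example_tag_runtime_usage() -> None:
    set_runtime_usage_data("dtype", "bfloat16")  # hypothetical key/value
    set_runtime_usage_data("tensor_parallel_size", 2)  # hypothetical key/value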
def is_usage_stats_enabled():
"""Determine whether or not we can send usage stats to the server.
@@ -42,8 +67,7 @@ def is_usage_stats_enabled():
no_usage_stats = envs.VLLM_NO_USAGE_STATS
do_not_track_file = os.path.exists(_USAGE_STATS_DO_NOT_TRACK_PATH)
_USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats or do_not_track_file)
return _USAGE_STATS_ENABLED
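# Minimal opt-out sketch (assumption): the checks above can be satisfied either
# via the VLLM_NO_USAGE_STATS environment variable or by creating the
# do-not-track marker file under VLLM_CONFIG_ROOT. Both must be in place before
# is_usage_stats_enabled() first runs, since the result is cached in
# _USAGE_STATS_ENABLED.
def _example_opt_out(config_home: str) -> None:
    os.environ["VLLM_NO_USAGE_STATS"] = "1"  # env-var opt-out (checked above)
    # File-based opt-out; the path mirrors _USAGE_STATS_DO_NOT_TRACK_PATH.
    os.makedirs(config_home, exist_ok=True)
    Path(os.path.join(config_home, "do_not_track")).touch(exist_ok=True)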
@@ -54,9 +78,11 @@ def _get_current_timestamp_ns() -> int:
def _detect_cloud_provider() -> str:
# Try detecting through vendor file
vendor_files = [
"/sys/class/dmi/id/product_version", "/sys/class/dmi/id/bios_vendor",
"/sys/class/dmi/id/product_version",
"/sys/class/dmi/id/bios_vendor",
"/sys/class/dmi/id/product_name",
"/sys/class/dmi/id/chassis_asset_tag", "/sys/class/dmi/id/sys_vendor"
"/sys/class/dmi/id/chassis_asset_tag",
"/sys/class/dmi/id/sys_vendor",
]
# Mapping of identifiable strings to cloud providers
cloud_identifiers = {
@@ -90,6 +116,7 @@ class UsageContext(str, Enum):
LLM_CLASS = "LLM_CLASS"
API_SERVER = "API_SERVER"
OPENAI_API_SERVER = "OPENAI_API_SERVER"
OPENAI_BATCH_RUNNER = "OPENAI_BATCH_RUNNER"
ENGINE_CONTEXT = "ENGINE_CONTEXT"
@@ -103,50 +130,97 @@ class UsageMessage:
self.uuid = str(uuid4())
# Environment Information
self.provider: str | None = None
self.num_cpu: int | None = None
self.cpu_type: str | None = None
self.cpu_family_model_stepping: str | None = None
self.total_memory: int | None = None
self.architecture: str | None = None
self.platform: str | None = None
self.cuda_runtime: str | None = None
self.gpu_count: int | None = None
self.gpu_type: str | None = None
self.gpu_memory_per_device: int | None = None
self.env_var_json: str | None = None
# vLLM Information
self.model_architecture: str | None = None
self.vllm_version: str | None = None
self.context: str | None = None
# Metadata
self.log_time: int | None = None
self.source: str | None = None
def report_usage(
self,
model_architecture: str,
usage_context: UsageContext,
extra_kvs: dict[str, Any] | None = None,
) -> None:
t = Thread(
target=self._report_usage_worker,
args=(model_architecture, usage_context, extra_kvs or {}),
daemon=True,
)
t.start()
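# Usage sketch (assumption, not part of this commit): report_usage() returns
# immediately; all collection happens on the daemon thread started above. The
# argument values here are illustrative only.
def _example_report(msg: "UsageMessage") -> None:
    msg.report_usage(
        model_architecture="LlamaForCausalLM",  # example architecture string
        usage_context=UsageContext.ENGINE_CONTEXT,
        extra_kvs={"enable_prefix_caching": True},  # hypothetical extra field
    )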
def _report_usage_worker(
self,
model_architecture: str,
usage_context: UsageContext,
extra_kvs: dict[str, Any],
) -> None:
self._report_usage_once(model_architecture, usage_context, extra_kvs)
self._report_continuous_usage()
def _report_tpu_inference_usage(self) -> bool:
try:
from tpu_inference import tpu_info, utils
self.gpu_count = tpu_info.get_num_chips()
self.gpu_type = tpu_info.get_tpu_type()
self.gpu_memory_per_device = utils.get_device_hbm_limit()
self.cuda_runtime = "tpu_inference"
return True
except Exception:
return False
def _report_torch_xla_usage(self) -> bool:
try:
import torch_xla
self.gpu_count = torch_xla.runtime.world_size()
self.gpu_type = torch_xla.tpu.get_tpu_type()
self.gpu_memory_per_device = torch_xla.core.xla_model.get_memory_info()[
"bytes_limit"
]
self.cuda_runtime = "torch_xla"
return True
except Exception:
return False
def _report_usage_once(
self,
model_architecture: str,
usage_context: UsageContext,
extra_kvs: dict[str, Any],
) -> None:
# Platform information
from vllm.platforms import current_platform
if current_platform.is_cuda_alike():
self.gpu_count = cuda_device_count_stateless()
self.gpu_type, self.gpu_memory_per_device = cuda_get_device_properties(
0, ("name", "total_memory")
)
if current_platform.is_cuda():
self.cuda_runtime = torch.version.cuda
if current_platform.is_tpu(): # noqa: SIM102
if (not self._report_tpu_inference_usage()) and (
not self._report_torch_xla_usage()
):
logger.exception("Failed to collect TPU information")
self.provider = _detect_cloud_provider()
self.architecture = platform.machine()
self.platform = platform.platform()
@@ -155,18 +229,24 @@ class UsageMessage:
info = cpuinfo.get_cpu_info()
self.num_cpu = info.get("count", None)
self.cpu_type = info.get("brand_raw", "")
self.cpu_family_model_stepping = ",".join([
str(info.get("family", "")),
str(info.get("model", "")),
str(info.get("stepping", ""))
])
self.cpu_family_model_stepping = ",".join(
[
str(info.get("family", "")),
str(info.get("model", "")),
str(info.get("stepping", "")),
]
)
# vLLM information
self.context = usage_context.value
self.vllm_version = VLLM_VERSION
self.model_architecture = model_architecture
# Environment variables
self.env_var_json = json.dumps(
{env_var: getattr(envs, env_var) for env_var in _USAGE_ENV_VARS_TO_COLLECT}
)
# Metadata
self.log_time = _get_current_timestamp_ns()
self.source = envs.VLLM_USAGE_SOURCE
@@ -178,7 +258,7 @@ class UsageMessage:
self._write_to_file(data)
self._send_to_server(data)
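# Shape sketch (assumption): the one-shot payload written and sent above carries
# only whitelisted environment variables; decoding env_var_json yields a flat
# dict keyed exactly by _USAGE_ENV_VARS_TO_COLLECT, never the full environment.
def _example_decode_env_vars(msg: "UsageMessage") -> dict[str, Any]:
    assert msg.env_var_json is not None  # populated in _report_usage_once()
    return json.loads(msg.env_var_json)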
def _report_continuous_usage(self):
"""Report usage every 10 minutes.
This helps us collect more data points on the uptime of running vLLM instances.
@@ -186,19 +266,24 @@ class UsageMessage:
"""
while True:
time.sleep(600)
data = {"uuid": self.uuid, "log_time": _get_current_timestamp_ns()}
data = {
"uuid": self.uuid,
"log_time": _get_current_timestamp_ns(),
}
data.update(_GLOBAL_RUNTIME_DATA)
self._write_to_file(data)
self._send_to_server(data)
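# Heartbeat shape sketch (assumption): a 10-minute report is just the stable
# uuid, a fresh timestamp, and any keys registered via set_runtime_usage_data().
def _example_heartbeat_payload(msg: "UsageMessage") -> dict[str, Any]:
    payload: dict[str, Any] = {
        "uuid": msg.uuid,
        "log_time": _get_current_timestamp_ns(),
    }
    payload.update(_GLOBAL_RUNTIME_DATA)  # e.g. a hypothetical "dtype" key
    return payload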
def _send_to_server(self, data: dict[str, Any]) -> None:
try:
global_http_client = global_http_connection.get_sync_client()
global_http_client.post(_USAGE_STATS_SERVER, json=data)
except requests.exceptions.RequestException:
# silently ignore unless we are using debug log
logging.debug("Failed to send usage data to server")
def _write_to_file(self, data: dict[str, Any]) -> None:
os.makedirs(os.path.dirname(_USAGE_STATS_JSON_PATH), exist_ok=True)
Path(_USAGE_STATS_JSON_PATH).touch(exist_ok=True)
with open(_USAGE_STATS_JSON_PATH, "a") as f: