Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/utils_/test_mem_utils.py
+++ b/tests/utils_/test_mem_utils.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from vllm_test_utils.monitor import monitor
+
+from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
+
+from ..utils import create_new_process_for_each_test
+
+
+@create_new_process_for_each_test()
+def test_memory_profiling():
+    # Fake out some model loading + inference memory usage to test profiling
+    # Memory used by other processes will show up as cuda usage outside of torch
+    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+
+    lib = CudaRTLibrary()
+    # 512 MiB allocation outside of this instance
+    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
+
+    baseline_snapshot = MemorySnapshot()
+
+    # load weights
+
+    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
+
+    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB
+
+    def measure_current_non_torch():
+        free, total = torch.cuda.mem_get_info()
+        current_used = total - free
+        current_torch = torch.cuda.memory_reserved()
+        current_non_torch = current_used - current_torch
+        return current_non_torch
+
+    with (
+        memory_profiling(
+            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
+        ) as result,
+        monitor(measure_current_non_torch) as monitored_values,
+    ):
+        # make a memory spike, 1 GiB
+        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
+        del spike
+
+        # Add some extra non-torch memory 256 MiB (simulate NCCL)
+        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
+
+    # this is an analytic value, it is exact,
+    # we only have 256 MiB non-torch memory increase
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
+    # Check that the memory usage is within 5% of the expected values
+    # 5% tolerance is caused by cuda runtime.
+    # we cannot control cuda runtime in the granularity of bytes,
+    # which causes a small error (<10 MiB in practice)
+    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
+    assert abs(non_torch_ratio - 1) <= 0.05
+    assert result.torch_peak_increase == 1024 * 1024 * 1024
+    del weights
+    lib.cudaFree(handle1)
+    lib.cudaFree(handle2)