[Core][Misc] Clean up ProfileExecuteDuration (#6461)

### What this PR does / why we need it?
This PR removes the custom `ProfileExecuteDuration` utility and its
usages across the codebase. This utility was used for profiling
execution duration of different stages in the inference process. It is
replaced by the standard `vllm.v1.utils.record_function_or_nullcontext`,
which integrates with PyTorch's profiler.

This change simplifies the code by removing a custom implementation in
favor of an upstream utility, improving maintainability. Associated
documentation and tests for `ProfileExecuteDuration` are also removed.

### Does this PR introduce _any_ user-facing change?
Yes. The `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` environment variable has been removed.

### How was this patch tested?
CI passed. The changes are a cleanup and replacement with a standard
utility. Existing tests cover the functionality. The removed feature had
its own tests which are also removed.

Related RFC: #5304

- vLLM version: v0.14.1
- vLLM main:
dc917cceb8

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2026-02-01 20:06:01 +08:00
committed by GitHub
parent 775fbc4cd2
commit b4aafd4293
10 changed files with 12 additions and 244 deletions

View File

@@ -1,71 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import gc
import os
import time
from unittest.mock import patch
import torch
import vllm # noqa: F401
from vllm_ascend.utils import ProfileExecuteDuration
@patch.dict(os.environ, {"VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": "1"})
def test_execue_duration_enabled_discrepancy():
    """With observation enabled, the NPU-side duration captured by
    ProfileExecuteDuration should roughly agree with the CPU wall-clock
    time for the same matmul (within a loose 50% tolerance).
    """
    a = torch.randn(10000, 10000).npu()
    b = torch.randn(10000, 10000).npu()

    # warmup: first matmul pays one-off kernel/compile costs; exclude it
    # from the timed region.
    torch.matmul(a, b)
    torch.npu.synchronize()

    cpu_start = time.perf_counter()
    with ProfileExecuteDuration().capture_async("forward"):
        torch.matmul(a, b)
        # NOTE(review): synchronize placement relative to the `with` block is
        # ambiguous in this paste; it must run before cpu_duration is read so
        # the wall-clock time covers the whole kernel — confirm original layout.
        torch.npu.synchronize()
    # perf_counter is in seconds; convert to milliseconds to match the
    # units returned by pop_captured_sync().
    cpu_duration = (time.perf_counter() - cpu_start) * 1000
    npu_durations = ProfileExecuteDuration().pop_captured_sync()
    assert npu_durations and 'forward' in npu_durations
    # Popping must drain the shared observation list.
    assert not ProfileExecuteDuration._observations
    # Assert discrepancy between CPU and NPU duration is within 50% roughly
    diff = abs(cpu_duration - npu_durations['forward']) / max(
        cpu_duration, npu_durations['forward'])
    assert diff <= 0.5, (
        f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms")

    # Free the large NPU tensors so later tests start from a clean device.
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()
def test_execue_duration_disabled():
    """Without the observe env var set, capture_async is a no-op and
    pop_captured_sync() returns nothing.
    """
    lhs = torch.randn(100, 100).npu()
    rhs = torch.randn(100, 100).npu()

    # ProfileExecuteDuration is a singleton, so holding one reference is
    # equivalent to constructing it at each call site.
    profiler = ProfileExecuteDuration()
    with profiler.capture_async("forward"):
        torch.matmul(lhs, rhs)
        torch.npu.synchronize()

    captured = profiler.pop_captured_sync()
    assert not captured

    # Clean up device memory for subsequent tests.
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()

View File

@@ -246,56 +246,3 @@ class TestUtils(TestBase):
utils.register_ascend_customop()
self.assertEqual(mock_customop.register_oot.call_count,
len(REGISTERED_ASCEND_OPS))
class TestProfileExecuteDuration(TestBase):
    """Unit tests for the ProfileExecuteDuration singleton helper."""

    def setUp(self):
        # Reset the class-level singleton state so every test starts from
        # a clean slate regardless of execution order.
        utils.ProfileExecuteDuration._instance = None
        utils.ProfileExecuteDuration._observations = []
        utils.ProfileExecuteDuration._lock = Lock()

    def test_singleton_creation(self):
        # First construction creates and records the singleton instance.
        instance1 = utils.ProfileExecuteDuration()
        self.assertIsNotNone(instance1)
        self.assertIs(instance1, utils.ProfileExecuteDuration._instance)
        # Subsequent constructions must hand back the same object.
        instance2 = utils.ProfileExecuteDuration()
        self.assertIs(instance1, instance2)

    def test_thread_safety(self):
        # Constructing the singleton concurrently from many threads must
        # always yield one and the same instance.
        from threading import Thread
        instances = []

        def create_instance():
            instances.append(utils.ProfileExecuteDuration())

        threads = [Thread(target=create_instance) for _ in range(10)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        first_instance = instances[0]
        for instance in instances[1:]:
            self.assertIs(first_instance, instance)

    def test_atexit_registration(self):
        # Creation registers the instance's destroy() hook with atexit.
        with mock.patch('atexit.register') as mock_register:
            instance = utils.ProfileExecuteDuration()
            mock_register.assert_called_once_with(instance.destroy)

    def test_lock_usage(self):
        # Construction must acquire and release the class-level lock
        # (wrap the real lock so the mock records the context-manager calls).
        original_lock = utils.ProfileExecuteDuration._lock
        with mock.patch.object(utils.ProfileExecuteDuration,
                               '_lock',
                               wraps=original_lock) as mock_lock:
            utils.ProfileExecuteDuration()
            mock_lock.__enter__.assert_called()
            mock_lock.__exit__.assert_called()

    def test_observations_initialization(self):
        # A fresh instance starts with an empty observation list.
        instance = utils.ProfileExecuteDuration()
        self.assertEqual(instance._observations, [])