[Core][Misc] Clean up ProfileExecuteDuration (#6461)
### What this PR does / why we need it?
This PR removes the custom `ProfileExecuteDuration` utility and its
usages across the codebase. This utility was used for profiling
execution duration of different stages in the inference process. It is
replaced by the standard `vllm.v1.utils.record_function_or_nullcontext`,
which integrates with PyTorch's profiler.
This change simplifies the code by removing a custom implementation in
favor of an upstream utility, improving maintainability. Associated
documentation and tests for `ProfileExecuteDuration` are also removed.
### Does this PR introduce _any_ user-facing change?
Yes. The `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` environment variable is removed.
### How was this patch tested?
CI passed. The changes are a cleanup and replacement with a standard
utility. Existing tests cover the functionality. The removed feature had
its own tests which are also removed.
Related RFC: #5304
- vLLM version: v0.14.1
- vLLM main: dc917cceb8
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -23,7 +23,7 @@ import atexit
|
||||
import functools
|
||||
import math
|
||||
import os
|
||||
from contextlib import contextmanager, nullcontext
|
||||
from contextlib import nullcontext
|
||||
from enum import Enum
|
||||
from functools import lru_cache
|
||||
from threading import Lock
|
||||
@@ -32,7 +32,6 @@ from typing import TYPE_CHECKING, Any
|
||||
import torch
|
||||
import torch_npu # noqa: F401
|
||||
from packaging.version import InvalidVersion, Version
|
||||
from torch_npu.npu.streams import Event
|
||||
from vllm.logger import logger
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
@@ -562,53 +561,6 @@ def dispose_tensor(x: torch.Tensor):
|
||||
x.set_(torch.empty((0,), device=x.device, dtype=x.dtype))
|
||||
|
||||
|
||||
class ProfileExecuteDuration:
    """Process-wide singleton that times tagged code regions with device events.

    ``capture_async`` records a start/end ``Event`` pair around a ``with``
    body (no synchronization is performed there); ``pop_captured_sync`` later
    synchronizes on the recorded events and returns the elapsed time per tag.
    Both are no-ops unless
    ``envs_ascend.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE`` is truthy.
    """

    # The single shared instance, created lazily by ``__new__``.
    _instance = None
    # Pending (tag, start_event, end_event) triples. Class-level, so the list
    # is shared by every reference to the singleton.
    _observations: list[tuple[str, Event, Event]] = []
    # Guards creation of ``_instance`` and every access to ``_observations``.
    _lock = Lock()

    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                # Drop any still-pending observations at interpreter exit.
                atexit.register(cls._instance.destroy)
            return cls._instance

    def destroy(self):
        """Discard every pending observation without synchronizing on it."""
        with self._lock:
            self._observations.clear()

    @contextmanager
    def capture_async(self, duration_tag: str):
        """Record a start/end event pair around the ``with`` body.

        Args:
            duration_tag: Key under which ``pop_captured_sync`` reports the
                elapsed time of this region.

        When observation is disabled the body runs with no events recorded.
        """
        if not envs_ascend.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE:
            yield
            return

        observe_start = Event(enable_timing=True)
        observe_start.record()
        try:
            yield
        finally:
            # Record the end event even if the body raised, so the pair is
            # always complete when it is queued.
            observe_end = Event(enable_timing=True)
            observe_end.record()
            with self._lock:
                self._observations.append((duration_tag, observe_start, observe_end))

    def pop_captured_sync(self) -> dict:
        """Pop and synchronize all events in the observation list.

        Returns:
            A dict mapping each duration tag to the value of
            ``start.elapsed_time(end)`` for that tag (presumably milliseconds
            -- confirm against the torch_npu ``Event`` documentation). Empty
            when observation is disabled or nothing was captured. When a tag
            was captured more than once, the earliest capture wins because
            entries are processed newest-first.
        """
        durations: dict[str, float] = {}
        if not envs_ascend.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE:
            return durations

        # Detach the pending entries atomically. Testing the list for
        # emptiness outside the lock and popping inside it would let a
        # concurrent ``destroy``/``pop_captured_sync`` empty the list in
        # between and make ``pop()`` raise IndexError.
        with self._lock:
            pending = list(self._observations)
            self._observations.clear()

        # Synchronize outside the lock so ``capture_async`` in other threads
        # is not blocked while waiting on an event. Iterate newest-first to
        # keep the same ordering as popping from the end of the list.
        for tag, observe_start, observe_end in reversed(pending):
            observe_end.synchronize()
            durations[tag] = observe_start.elapsed_time(observe_end)

        return durations
|
||||
|
||||
|
||||
def register_ascend_customop(vllm_config: VllmConfig | None = None):
|
||||
"""Register Ascend CustomOP
|
||||
|
||||
|
||||
Reference in New Issue
Block a user