update

2026-04-09 11:23:47 +08:00
parent 8082d5f4b2
commit 72387e4fa8
1885 changed files with 611521 additions and 1 deletions
--- a/vllm/profiler/init.py
+++ b/vllm/profiler/init.py
--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
@@ -0,0 +1,400 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import copy
+from collections import defaultdict
+from collections.abc import Callable
+from dataclasses import asdict, dataclass, field
+from typing import Any, Generic, TypeAlias, TypeVar
+
+from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult
+from torch._C._profiler import _EventType, _ExperimentalConfig, _ProfilerEvent
+from torch.autograd.profiler import FunctionEvent
+from torch.profiler import ProfilerActivity, profile
+
+from vllm.profiler.utils import (
+    TablePrinter,
+    event_has_module,
+    event_is_torch_op,
+    event_module_repr,
+    event_torch_op_stack_trace,
+    indent_string,
+)
+from vllm.utils.import_utils import PlaceholderModule
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = PlaceholderModule("pandas")
+
+
+@dataclass
+class _ModuleTreeNode:
+    event: _ProfilerEvent
+    parent: "_ModuleTreeNode | None" = None
+    children: list["_ModuleTreeNode"] = field(default_factory=list)
+    trace: str = ""
+
+    @property
+    def is_leaf(self):
+        return self.event.children is None or len(self.event.children) == 0
+
+    @property
+    def is_torch_op(self):
+        return event_is_torch_op(self.event)
+
+    @property
+    def is_cuda(self):
+        return (
+            self.event.tag == _EventType.Kineto
+            and self.event.typed[1].device_type == DeviceType.CUDA
+        )
+
+
+@dataclass
+class SummaryStatsEntry:
+    name: str
+    cuda_time_us: float
+    pct_cuda_time: float
+    invocations: int
+
+
+@dataclass
+class ModelStatsEntry:
+    name: str
+    cpu_time_us: float
+    cuda_time_us: float
+    pct_cuda_time: float
+    trace: str
+
+
+StatsEntry: TypeAlias = ModelStatsEntry | SummaryStatsEntry
+StatsEntryT = TypeVar("StatsEntryT", bound=StatsEntry)
+
+
+@dataclass
+class _StatsTreeNode(Generic[StatsEntryT]):
+    entry: StatsEntryT
+    children: list["_StatsTreeNode[StatsEntryT]"] = field(default_factory=list)
+    parent: "_StatsTreeNode[StatsEntryT] | None" = None
+
+
+@dataclass
+class LayerwiseProfileResults(profile):
+    _kineto_results: _ProfilerResult
+    _kineto_event_correlation_map: dict[int, list[_KinetoEvent]] = field(init=False)
+    _event_correlation_map: dict[int, list[FunctionEvent]] = field(init=False)
+    _module_tree: list[_ModuleTreeNode] = field(init=False)
+    _model_stats_tree: list[_StatsTreeNode[ModelStatsEntry]] = field(init=False)
+    _summary_stats_tree: list[_StatsTreeNode[SummaryStatsEntry]] = field(init=False)
+
+    # profile metadata
+    num_running_seqs: int | None = None
+
+    def __post_init__(self):
+        self._build_correlation_map()
+        self._build_module_tree()
+        self._build_stats_trees()
+
+    def print_model_table(self, column_widths: dict[str, int] | None = None):
+        _column_widths = dict(
+            name=60, cpu_time_us=12, cuda_time_us=12, pct_cuda_time=12, trace=60
+        )
+        if column_widths:
+            _column_widths.update(**column_widths)
+        filtered_model_table = [
+            (depth, row)
+            for depth, row in self._flatten_stats_tree(self._model_stats_tree)
+            if row.cuda_time_us > 0 or row.cpu_time_us > 0
+        ]
+        TablePrinter(ModelStatsEntry, _column_widths).print_table(
+            self._indent_row_names_based_on_depth(
+                filtered_model_table,
+                indent_style=lambda indent: "|" + "-" * indent + " ",
+            )
+        )
+
+    def print_summary_table(self, column_widths: dict[str, int] | None = None):
+        _column_widths = dict(
+            name=80, cuda_time_us=12, pct_cuda_time=12, invocations=15
+        )
+        if column_widths:
+            _column_widths.update(**column_widths)
+        filtered_summary_table = [
+            (depth, row)
+            for depth, row in self._flatten_stats_tree(self._summary_stats_tree)
+            if row.cuda_time_us > 0
+        ]
+        TablePrinter(SummaryStatsEntry, _column_widths).print_table(
+            self._indent_row_names_based_on_depth(
+                filtered_summary_table,
+                indent_style=lambda indent: "|" + "-" * indent + " ",
+            )
+        )
+
+    def export_model_stats_table_csv(self, filename: str):
+        df = pd.DataFrame(
+            [asdict(row) for _, row in self._flatten_stats_tree(self._model_stats_tree)]
+        )
+        df.to_csv(filename)
+
+    def export_summary_stats_table_csv(self, filename: str):
+        df = pd.DataFrame(
+            [
+                asdict(row)
+                for _, row in self._flatten_stats_tree(self._summary_stats_tree)
+            ]
+        )
+        df.to_csv(filename)
+
+    def convert_stats_to_dict(self) -> dict[str, Any]:
+        return {
+            "metadata": {"num_running_seqs": self.num_running_seqs},
+            "summary_stats": self._convert_stats_tree_to_dict(self._summary_stats_tree),
+            "model_stats": self._convert_stats_tree_to_dict(self._model_stats_tree),
+        }
+
+    @staticmethod
+    def _indent_row_names_based_on_depth(
+        depths_rows: list[tuple[int, StatsEntryT]],
+        indent_style: Callable[[int], str] | str = " ",
+    ):
+        indented_rows: list[StatsEntryT] = []
+        for depth, row in depths_rows:
+            if row.cuda_time_us == 0:
+                continue
+            indented_row: StatsEntryT = copy.deepcopy(row)
+            indented_row.name = indent_string(indented_row.name, depth, indent_style)
+            indented_rows.append(indented_row)
+        return indented_rows
+
+    def _build_correlation_map(self):
+        self._kineto_event_correlation_map = defaultdict(list)
+        for event in self._kineto_results.events():
+            self._kineto_event_correlation_map[event.correlation_id()].append(event)
+
+    def _build_module_tree(self):
+        self._module_tree = []
+        event_tree = self._kineto_results.experimental_event_tree()
+
+        def _df_traversal(
+            event: _ProfilerEvent, curr_node: _ModuleTreeNode | None = None
+        ):
+            # For the tensor parallel case for now only look at task 1
+            if event.start_tid != 1:
+                return
+
+            if event_has_module(event):
+                node = _ModuleTreeNode(event=event, parent=curr_node)
+                if curr_node:
+                    curr_node.children.append(node)
+                else:
+                    self._module_tree.append(node)
+                curr_node = node
+
+            is_leaf = event.children is None or len(event.children) == 0
+            if is_leaf and curr_node:
+                node = _ModuleTreeNode(
+                    event=event,
+                    parent=curr_node,
+                    trace=event_torch_op_stack_trace(
+                        event, until=lambda x: event_has_module(x)
+                    ),
+                )
+                curr_node.children.append(node)
+                curr_node = node
+
+            for child in event.children:
+                _df_traversal(child, curr_node)
+
+        for root in event_tree:
+            _df_traversal(root)
+
+    def _get_kineto_gpu_event(self, node: _ModuleTreeNode):
+        if node.event.tag != _EventType.Kineto:
+            return None
+        correlated_kineto_events = self._kineto_event_correlation_map.get(
+            node.event.correlation_id, []
+        )
+        iterator = (
+            x
+            for x in correlated_kineto_events
+            if x.device_type() == DeviceType.CUDA and x.name() == node.event.name
+        )
+        return next(iterator, None)
+
+    def _cumulative_cuda_time(self, node: _ModuleTreeNode):
+        "Return cuda time in microseconds"
+
+        def _cumulative_cuda_time_recursive(node: _ModuleTreeNode):
+            if node.is_leaf and (gpu_kineto_event := self._get_kineto_gpu_event(node)):
+                return gpu_kineto_event.duration_ns() / 1000.0
+            else:
+                cumulative_cuda_time = 0
+                for child in node.children:
+                    cumulative_cuda_time += _cumulative_cuda_time_recursive(child)
+                return cumulative_cuda_time
+
+        return _cumulative_cuda_time_recursive(node)
+
+    def _total_cuda_time(self):
+        return sum([self._cumulative_cuda_time(root) for root in self._module_tree])
+
+    def _build_stats_trees(self):
+        summary_dict: dict[tuple[str, ...], _StatsTreeNode[SummaryStatsEntry]] = {}
+        total_cuda_time = self._total_cuda_time()
+
+        def pct_cuda_time(cuda_time_us):
+            return (cuda_time_us / total_cuda_time) * 100
+
+        def build_summary_stats_tree_df(
+            node: _ModuleTreeNode,
+            parent: _StatsTreeNode[SummaryStatsEntry] | None = None,
+            summary_trace: tuple[str, ...] = (),
+        ) -> _StatsTreeNode[SummaryStatsEntry] | None:
+            if event_has_module(node.event):
+                name = event_module_repr(node.event)
+                cuda_time_us = self._cumulative_cuda_time(node)
+            elif gpu_kineto_event := self._get_kineto_gpu_event(node):
+                name = gpu_kineto_event.name()
+                cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
+            else:
+                return None
+
+            summary_trace = summary_trace + (name,)
+            if summary_trace in summary_dict:
+                entry = summary_dict[summary_trace].entry
+                entry.cuda_time_us += cuda_time_us
+                entry.invocations += 1
+                entry.pct_cuda_time = pct_cuda_time(entry.cuda_time_us)
+            else:
+                new_node = _StatsTreeNode(
+                    entry=SummaryStatsEntry(
+                        name=name,
+                        cuda_time_us=cuda_time_us,
+                        pct_cuda_time=pct_cuda_time(cuda_time_us),
+                        invocations=1,
+                    ),
+                    parent=parent,
+                )
+                if parent:
+                    parent.children.append(new_node)
+                summary_dict[summary_trace] = new_node
+
+            for child in node.children:
+                build_summary_stats_tree_df(
+                    child, summary_dict[summary_trace], summary_trace
+                )
+
+            return summary_dict[summary_trace]
+
+        self._summary_stats_tree = []
+        for root in self._module_tree:
+            summary_node = build_summary_stats_tree_df(root)
+            if summary_node is not None:
+                self._summary_stats_tree.append(summary_node)
+
+        def build_model_stats_tree_df(
+            node: _ModuleTreeNode,
+            parent: _StatsTreeNode[ModelStatsEntry] | None = None,
+        ) -> _StatsTreeNode[ModelStatsEntry] | None:
+            if event_has_module(
+                node.event,
+            ):
+                name = event_module_repr(node.event)
+                cuda_time_us = self._cumulative_cuda_time(node)
+                cpu_time_us = node.event.duration_time_ns / 1000
+                trace = ""
+            elif gpu_kineto_event := self._get_kineto_gpu_event(node):
+                name = gpu_kineto_event.name()
+                cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
+                cpu_time_us = 0
+                trace = node.trace
+            else:
+                return None
+
+            new_node = _StatsTreeNode(
+                entry=ModelStatsEntry(
+                    name=name,
+                    cpu_time_us=cpu_time_us,
+                    cuda_time_us=cuda_time_us,
+                    pct_cuda_time=pct_cuda_time(cuda_time_us),
+                    trace=trace,
+                ),
+                parent=parent,
+            )
+            if parent:
+                parent.children.append(new_node)
+
+            for child in node.children:
+                build_model_stats_tree_df(child, new_node)
+
+            return new_node
+
+        self._model_stats_tree = []
+        for root in self._module_tree:
+            model_node = build_model_stats_tree_df(root)
+            if model_node is not None:
+                self._model_stats_tree.append(model_node)
+
+    def _flatten_stats_tree(
+        self, tree: list[_StatsTreeNode[StatsEntryT]]
+    ) -> list[tuple[int, StatsEntryT]]:
+        entries: list[tuple[int, StatsEntryT]] = []
+
+        def df_traversal(node: _StatsTreeNode[StatsEntryT], depth: int = 0):
+            entries.append((depth, node.entry))
+            for child in node.children:
+                df_traversal(child, depth=depth + 1)
+
+        for root in tree:
+            df_traversal(root)
+
+        return entries
+
+    def _convert_stats_tree_to_dict(
+        self, tree: list[_StatsTreeNode[StatsEntryT]]
+    ) -> list[dict[str, Any]]:
+        root_dicts: list[dict[str, Any]] = []
+
+        def df_traversal(
+            node: _StatsTreeNode[StatsEntryT], curr_json_list: list[dict[str, Any]]
+        ):
+            curr_json_list.append({"entry": asdict(node.entry), "children": []})
+            for child in node.children:
+                df_traversal(child, curr_json_list[-1]["children"])
+
+        for root in tree:
+            df_traversal(root, root_dicts)
+
+        return root_dicts
+
+
+class layerwise_profile(profile):
+    def __init__(self, num_running_seqs: int | None = None):
+        """
+        layerwise profile constructor.
+
+        Args:
+            num_running_seqs (Optional[int], optional): When given,
+                num_running_seqs will be passed to LayerProfileResults
+                for metadata update. Defaults to None.
+        """
+        super().__init__(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            record_shapes=True,
+            with_stack=True,
+            with_modules=True,
+            experimental_config=_ExperimentalConfig(verbose=True),
+        )
+
+        self.num_running_seqs = num_running_seqs
+
+    def __enter__(self):
+        return super().__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        super().__exit__(exc_type, exc_val, exc_tb)
+        self.results = LayerwiseProfileResults(
+            self.profiler.kineto_results, num_running_seqs=self.num_running_seqs
+        )
--- a/vllm/profiler/utils.py
+++ b/vllm/profiler/utils.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import dataclasses
+from collections.abc import Callable
+
+from _typeshed import DataclassInstance
+from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata
+
+#
+# String / Print Manipulation
+#
+
+
+def trim_string_front(string: str, width: int) -> str:
+    if len(string) > width:
+        offset = len(string) - width + 3
+        string = string[offset:]
+        if len(string) > 3:
+            string = "..." + string[3:]
+    return string
+
+
+def trim_string_back(string: str, width: int) -> str:
+    if len(string) > width:
+        offset = len(string) - width + 3
+        string = string[:-offset]
+        if len(string) > 3:
+            string = string + "..."
+    return string
+
+
+class TablePrinter:
+    def __init__(self, row_cls: type[DataclassInstance], column_widths: dict[str, int]):
+        self.row_cls = row_cls
+        self.fieldnames = [x.name for x in dataclasses.fields(row_cls)]
+        self.column_widths = column_widths
+        assert set(self.column_widths.keys()) == set(self.fieldnames)
+
+    def print_table(self, rows: list[DataclassInstance]):
+        self._print_header()
+        self._print_line()
+        for row in rows:
+            self._print_row(row)
+
+    def _print_header(self):
+        for i, f in enumerate(self.fieldnames):
+            last = i == len(self.fieldnames) - 1
+            col_width = self.column_widths[f]
+            print(
+                trim_string_back(f, col_width).ljust(col_width),
+                end=" | " if not last else "\n",
+            )
+
+    def _print_row(self, row):
+        assert isinstance(row, self.row_cls)
+
+        for i, f in enumerate(self.fieldnames):
+            last = i == len(self.fieldnames) - 1
+            col_width = self.column_widths[f]
+            val = getattr(row, f)
+
+            val_str = ""
+            if isinstance(val, str):
+                val_str = trim_string_back(val, col_width).ljust(col_width)
+            elif type(val) in [float, int]:
+                val_str = f"{float(val):>.2f}".rjust(col_width)
+            else:
+                val_str = f"{val}".rjust(col_width)
+            print(val_str, end=" | " if not last else "\n")
+
+    def _print_line(self):
+        total_col_width = 0
+        for column_width in self.column_widths.values():
+            total_col_width += column_width
+        print("=" * (total_col_width + 3 * (len(self.column_widths) - 1)))
+
+
+def indent_string(
+    string: str, indent: int, indent_style: Callable[[int], str] | str = " "
+) -> str:
+    if indent:
+        if isinstance(indent_style, str):
+            return indent_style * indent + string
+        else:
+            return indent_style(indent) + string
+    else:
+        return string
+
+
+#
+# _ProfilerEvent utils
+#
+
+
+def event_has_module(event: _ProfilerEvent) -> bool:
+    event_type, typed_event = event.typed
+    if event_type == _EventType.PyCall:
+        return typed_event.module is not None
+    return False
+
+
+def event_is_torch_op(event: _ProfilerEvent) -> bool:
+    return event.tag == _EventType.TorchOp
+
+
+def event_arg_repr(arg) -> str:
+    if arg is None or type(arg) in [float, int, bool, str]:
+        return f"{arg}"
+    elif isinstance(arg, list):
+        return f"[{', '.join([event_arg_repr(x) for x in arg])}]"
+    elif isinstance(arg, tuple):
+        return f"({', '.join([event_arg_repr(x) for x in arg])})"
+    else:
+        assert isinstance(arg, _TensorMetadata), f"Unsupported type: {type(arg)}"
+        sizes_str = ", ".join([str(x) for x in arg.sizes])
+        return f"{str(arg.dtype).replace('torch.', '')}[{sizes_str}]"
+
+
+def event_torch_op_repr(event: _ProfilerEvent) -> str:
+    assert event.tag == _EventType.TorchOp
+    args_str = ", ".join([event_arg_repr(x) for x in event.typed[1].inputs])
+    return f"{event.name}({args_str})".replace("aten::", "")
+
+
+def event_module_repr(event: _ProfilerEvent) -> str:
+    assert event_has_module(event)
+    module = event.typed[1].module
+    if module.parameters and len(module.parameters) > 0:
+        args_str = ", ".join(
+            [f"{x[0]}={event_arg_repr(x[1])}" for x in module.parameters]
+        )
+        return f"{module.cls_name}({args_str})"
+    else:
+        return module.cls_name
+
+
+def event_torch_op_stack_trace(
+    curr_event: _ProfilerEvent, until: Callable[[_ProfilerEvent], bool]
+) -> str:
+    trace = ""
+    curr_event = curr_event.parent
+    while curr_event and not until(curr_event):
+        if event_is_torch_op(curr_event):
+            if len(trace) > 0:
+                trace += " <- "
+            trace += event_torch_op_repr(curr_event)
+        curr_event = curr_event.parent
+
+    return trace
--- a/vllm/profiler/wrapper.py
+++ b/vllm/profiler/wrapper.py
@@ -0,0 +1,254 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from contextlib import nullcontext
+from typing import Literal
+
+import torch
+from typing_extensions import override
+
+from vllm.config import ProfilerConfig
+from vllm.config.profiler import _is_uri_path
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class WorkerProfiler(ABC):
+    def __init__(self, profiler_config: ProfilerConfig) -> None:
+        self._delay_iters = profiler_config.delay_iterations
+        if self._delay_iters > 0:
+            logger.info_once(
+                "GPU profiling will start "
+                f"{self._delay_iters} steps after start_profile."
+            )
+
+        self._max_iters = profiler_config.max_iterations
+        if self._max_iters > 0:
+            logger.info_once(
+                "GPU profiling will stop "
+                f"after {self._max_iters} worker steps, "
+                "or when stop_profile is received."
+            )
+
+        # Track when the profiler gets triggered by start_profile
+        self._active_iteration_count = 0
+        self._active = False
+
+        # Track when the profiler is actually running
+        self._profiling_for_iters = 0
+        self._running = False
+
+    @abstractmethod
+    def _start(self) -> None:
+        """Start the profiler."""
+        pass
+
+    @abstractmethod
+    def _stop(self) -> None:
+        """Stop the profiler."""
+        pass
+
+    def _call_start(self) -> None:
+        """Call _start with error handling but no safeguards."""
+        try:
+            self._start()
+            self._running = True  # Only mark as running if start succeeds
+        except Exception as e:
+            logger.warning("Failed to start profiler: %s", e)
+
+    def _call_stop(self) -> None:
+        """Call _stop with error handling but no safeguards."""
+        try:
+            self._stop()
+            logger.info_once("Profiler stopped successfully.", scope="local")
+        except Exception as e:
+            logger.warning("Failed to stop profiler: %s", e)
+        self._running = False  # Always mark as not running, assume stop worked
+
+    def start(self) -> None:
+        """Attempt to start the profiler, accounting for delayed starts."""
+        if self._active:
+            logger.debug(
+                "start_profile received when profiler is already active. "
+                "Ignoring request."
+            )
+            return
+        self._active = True
+        if self._delay_iters == 0:
+            self._call_start()
+
+    def step(self) -> None:
+        """Update the profiler state at each worker step,
+        to handle delayed starts and max iteration limits."""
+        if not self._active:
+            return
+
+        self._active_iteration_count += 1
+
+        if (
+            not self._running
+            and self._delay_iters > 0
+            and self._active_iteration_count == self._delay_iters
+        ):
+            logger.info_once("Starting profiler after delay...", scope="local")
+            self._call_start()
+
+        if self._running:
+            self._profiling_for_iters += 1
+
+        if (
+            self._max_iters > 0
+            and self._running
+            and self._profiling_for_iters > self._max_iters
+        ):
+            # Automatically stop the profiler after max iters
+            # will be marked as not running, but leave as active so that stop
+            # can clean up properly
+            logger.info_once(
+                "Max profiling iterations reached. Stopping profiler...", scope="local"
+            )
+            self._call_stop()
+            return
+
+    def stop(self) -> None:
+        """Attempt to stop the profiler, accounting for overlapped calls."""
+        if not self._active:
+            logger.debug(
+                "stop_profile received when profiler is not active. Ignoring request."
+            )
+            return
+        self._active = False
+        self._active_iteration_count = 0
+        self._profiling_for_iters = 0
+
+        if self._running:
+            self._call_stop()
+
+    def shutdown(self) -> None:
+        """Ensure profiler is stopped when shutting down."""
+        logger.info_once("Shutting down profiler", scope="local")
+        if self._running:
+            self.stop()
+
+    def annotate_context_manager(self, name: str):
+        """Return a context manager to annotate profiler traces."""
+        return nullcontext()
+
+
+TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"]
+TorchProfilerActivityMap = {
+    "CPU": torch.profiler.ProfilerActivity.CPU,
+    "CUDA": torch.profiler.ProfilerActivity.CUDA,
+    "XPU": torch.profiler.ProfilerActivity.XPU,
+}
+
+
+class TorchProfilerWrapper(WorkerProfiler):
+    def __init__(
+        self,
+        profiler_config: ProfilerConfig,
+        worker_name: str,
+        local_rank: int,
+        activities: list[TorchProfilerActivity],
+        on_trace_ready: Callable[[torch.profiler.profile], None] | None = None,
+    ) -> None:
+        super().__init__(profiler_config)
+
+        self.local_rank = local_rank
+        self.profiler_config = profiler_config
+        torch_profiler_trace_dir = profiler_config.torch_profiler_dir
+        if local_rank in (None, 0):
+            logger.info_once(
+                "Torch profiling enabled. Traces will be saved to: %s",
+                torch_profiler_trace_dir,
+                scope="local",
+            )
+            logger.debug(
+                "Profiler config: record_shapes=%s,"
+                "profile_memory=%s,with_stack=%s,with_flops=%s",
+                profiler_config.torch_profiler_record_shapes,
+                profiler_config.torch_profiler_with_memory,
+                profiler_config.torch_profiler_with_stack,
+                profiler_config.torch_profiler_with_flops,
+            )
+
+        # Determine trace handler: use custom handler if provided,
+        # otherwise default to tensorboard trace handler
+        if on_trace_ready is not None:
+            trace_handler = on_trace_ready
+        else:
+            trace_handler = torch.profiler.tensorboard_trace_handler(
+                torch_profiler_trace_dir,
+                worker_name=worker_name,
+                use_gzip=profiler_config.torch_profiler_use_gzip,
+            )
+
+        self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1
+        self.profiler = torch.profiler.profile(
+            activities=[TorchProfilerActivityMap[activity] for activity in activities],
+            record_shapes=profiler_config.torch_profiler_record_shapes,
+            profile_memory=profiler_config.torch_profiler_with_memory,
+            with_stack=profiler_config.torch_profiler_with_stack,
+            with_flops=profiler_config.torch_profiler_with_flops,
+            on_trace_ready=trace_handler,
+        )
+
+    @override
+    def _start(self) -> None:
+        self.profiler.start()
+
+    @override
+    def _stop(self) -> None:
+        self.profiler.stop()
+
+        profiler_config = self.profiler_config
+        rank = self.local_rank
+        if profiler_config.torch_profiler_dump_cuda_time_total:
+            profiler_dir = profiler_config.torch_profiler_dir
+            sort_key = "self_cuda_time_total"
+            table = self.profiler.key_averages().table(sort_by=sort_key)
+
+            # Skip file write for URI paths (gs://, s3://, etc.)
+            # as standard file I/O doesn't work with URI schemes
+            if not _is_uri_path(profiler_dir):
+                profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
+                with open(profiler_out_file, "w") as f:
+                    print(table, file=f)
+
+            # only print profiler results on rank 0
+            if rank == 0:
+                print(table)
+        if self.dump_cpu_time_total and rank == 0:
+            logger.info(
+                self.profiler.key_averages().table(
+                    sort_by="self_cpu_time_total", row_limit=50
+                )
+            )
+
+    @override
+    def annotate_context_manager(self, name: str):
+        return torch.profiler.record_function(name)
+
+
+class CudaProfilerWrapper(WorkerProfiler):
+    def __init__(self, profiler_config: ProfilerConfig) -> None:
+        super().__init__(profiler_config)
+        # Note: lazy import to avoid dependency issues if CUDA is not available.
+        import torch.cuda.profiler as cuda_profiler
+
+        self._cuda_profiler = cuda_profiler
+
+    @override
+    def _start(self) -> None:
+        self._cuda_profiler.start()
+
+    @override
+    def _stop(self) -> None:
+        self._cuda_profiler.stop()
+
+    @override
+    def annotate_context_manager(self, name: str):
+        return torch.cuda.nvtx.range(name)