Sync from v0.13
This commit is contained in:
0
vllm/profiler/__init__.py
Normal file
0
vllm/profiler/__init__.py
Normal file
392
vllm/profiler/layerwise_profile.py
Normal file
392
vllm/profiler/layerwise_profile.py
Normal file
@@ -0,0 +1,392 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import copy
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from typing import Any, Optional, TypeAlias
|
||||
|
||||
from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult
|
||||
from torch._C._profiler import _EventType, _ExperimentalConfig, _ProfilerEvent
|
||||
from torch.autograd.profiler import FunctionEvent
|
||||
from torch.profiler import ProfilerActivity, profile
|
||||
|
||||
from vllm.profiler.utils import (
|
||||
TablePrinter,
|
||||
event_has_module,
|
||||
event_is_torch_op,
|
||||
event_module_repr,
|
||||
event_torch_op_stack_trace,
|
||||
indent_string,
|
||||
)
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
pd = PlaceholderModule("pandas")
|
||||
|
||||
|
||||
@dataclass
|
||||
class _ModuleTreeNode:
|
||||
event: _ProfilerEvent
|
||||
parent: Optional["_ModuleTreeNode"] = None
|
||||
children: list["_ModuleTreeNode"] = field(default_factory=list)
|
||||
trace: str = ""
|
||||
|
||||
@property
|
||||
def is_leaf(self):
|
||||
return self.event.children is None or len(self.event.children) == 0
|
||||
|
||||
@property
|
||||
def is_torch_op(self):
|
||||
return event_is_torch_op(self.event)
|
||||
|
||||
@property
|
||||
def is_cuda(self):
|
||||
return (
|
||||
self.event.tag == _EventType.Kineto
|
||||
and self.event.typed[1].device_type == DeviceType.CUDA
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SummaryStatsEntry:
|
||||
name: str
|
||||
cuda_time_us: float
|
||||
pct_cuda_time: float
|
||||
invocations: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelStatsEntry:
|
||||
name: str
|
||||
cpu_time_us: float
|
||||
cuda_time_us: float
|
||||
pct_cuda_time: float
|
||||
trace: str
|
||||
|
||||
|
||||
StatsEntry: TypeAlias = ModelStatsEntry | SummaryStatsEntry
|
||||
|
||||
|
||||
@dataclass
|
||||
class _StatsTreeNode:
|
||||
entry: StatsEntry
|
||||
children: list[StatsEntry]
|
||||
parent: StatsEntry | None
|
||||
|
||||
|
||||
@dataclass
|
||||
class LayerwiseProfileResults(profile):
|
||||
_kineto_results: _ProfilerResult
|
||||
_kineto_event_correlation_map: dict[int, list[_KinetoEvent]] = field(init=False)
|
||||
_event_correlation_map: dict[int, list[FunctionEvent]] = field(init=False)
|
||||
_module_tree: list[_ModuleTreeNode] = field(init=False)
|
||||
_model_stats_tree: list[_StatsTreeNode] = field(init=False)
|
||||
_summary_stats_tree: list[_StatsTreeNode] = field(init=False)
|
||||
|
||||
# profile metadata
|
||||
num_running_seqs: int | None = None
|
||||
|
||||
def __post_init__(self):
|
||||
self._build_correlation_map()
|
||||
self._build_module_tree()
|
||||
self._build_stats_trees()
|
||||
|
||||
def print_model_table(self, column_widths: dict[str, int] = None):
|
||||
_column_widths = dict(
|
||||
name=60, cpu_time_us=12, cuda_time_us=12, pct_cuda_time=12, trace=60
|
||||
)
|
||||
if column_widths:
|
||||
_column_widths.update(**column_widths)
|
||||
filtered_model_table = [
|
||||
(depth, row)
|
||||
for depth, row in self._flatten_stats_tree(self._model_stats_tree)
|
||||
if row.cuda_time_us > 0 or row.cpu_time_us > 0
|
||||
]
|
||||
TablePrinter(ModelStatsEntry, _column_widths).print_table(
|
||||
self._indent_row_names_based_on_depth(
|
||||
filtered_model_table,
|
||||
indent_style=lambda indent: "|" + "-" * indent + " ",
|
||||
)
|
||||
)
|
||||
|
||||
def print_summary_table(self, column_widths: dict[str, int] = None):
|
||||
_column_widths = dict(
|
||||
name=80, cuda_time_us=12, pct_cuda_time=12, invocations=15
|
||||
)
|
||||
if column_widths:
|
||||
_column_widths.update(**column_widths)
|
||||
filtered_summary_table = [
|
||||
(depth, row)
|
||||
for depth, row in self._flatten_stats_tree(self._summary_stats_tree)
|
||||
if row.cuda_time_us > 0
|
||||
]
|
||||
TablePrinter(SummaryStatsEntry, _column_widths).print_table(
|
||||
self._indent_row_names_based_on_depth(
|
||||
filtered_summary_table,
|
||||
indent_style=lambda indent: "|" + "-" * indent + " ",
|
||||
)
|
||||
)
|
||||
|
||||
def export_model_stats_table_csv(self, filename: str):
|
||||
df = pd.DataFrame(
|
||||
[asdict(row) for _, row in self._flatten_stats_tree(self._model_stats_tree)]
|
||||
)
|
||||
df.to_csv(filename)
|
||||
|
||||
def export_summary_stats_table_csv(self, filename: str):
|
||||
df = pd.DataFrame(
|
||||
[
|
||||
asdict(row)
|
||||
for _, row in self._flatten_stats_tree(self._summary_stats_tree)
|
||||
]
|
||||
)
|
||||
df.to_csv(filename)
|
||||
|
||||
def convert_stats_to_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"metadata": {"num_running_seqs": self.num_running_seqs},
|
||||
"summary_stats": self._convert_stats_tree_to_dict(self._summary_stats_tree),
|
||||
"model_stats": self._convert_stats_tree_to_dict(self._model_stats_tree),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _indent_row_names_based_on_depth(
|
||||
depths_rows: list[tuple[int, StatsEntry]],
|
||||
indent_style: Callable[[int], str] | str = " ",
|
||||
):
|
||||
indented_rows = []
|
||||
for depth, row in depths_rows:
|
||||
if row.cuda_time_us == 0:
|
||||
continue
|
||||
indented_row = copy.deepcopy(row)
|
||||
indented_row.name = indent_string(indented_row.name, depth, indent_style)
|
||||
indented_rows.append(indented_row)
|
||||
return indented_rows
|
||||
|
||||
def _build_correlation_map(self):
|
||||
self._kineto_event_correlation_map = defaultdict(list)
|
||||
for event in self._kineto_results.events():
|
||||
self._kineto_event_correlation_map[event.correlation_id()].append(event)
|
||||
|
||||
def _build_module_tree(self):
|
||||
self._module_tree = []
|
||||
event_tree = self._kineto_results.experimental_event_tree()
|
||||
|
||||
def _df_traversal(
|
||||
event: _ProfilerEvent, curr_node: _ModuleTreeNode | None = None
|
||||
):
|
||||
# For the tensor parallel case for now only look at task 1
|
||||
if event.start_tid != 1:
|
||||
return
|
||||
|
||||
if event_has_module(event):
|
||||
node = _ModuleTreeNode(event=event, parent=curr_node)
|
||||
if curr_node:
|
||||
curr_node.children.append(node)
|
||||
else:
|
||||
self._module_tree.append(node)
|
||||
curr_node = node
|
||||
|
||||
is_leaf = event.children is None or len(event.children) == 0
|
||||
if is_leaf and curr_node:
|
||||
node = _ModuleTreeNode(
|
||||
event=event,
|
||||
parent=curr_node,
|
||||
trace=event_torch_op_stack_trace(
|
||||
event, until=lambda x: event_has_module(x)
|
||||
),
|
||||
)
|
||||
curr_node.children.append(node)
|
||||
curr_node = node
|
||||
|
||||
for child in event.children:
|
||||
_df_traversal(child, curr_node)
|
||||
|
||||
for root in event_tree:
|
||||
_df_traversal(root)
|
||||
|
||||
def _get_kineto_gpu_event(self, node: _ModuleTreeNode):
|
||||
if node.event.tag != _EventType.Kineto:
|
||||
return None
|
||||
correlated_kineto_events = self._kineto_event_correlation_map.get(
|
||||
node.event.correlation_id, []
|
||||
)
|
||||
iterator = (
|
||||
x
|
||||
for x in correlated_kineto_events
|
||||
if x.device_type() == DeviceType.CUDA and x.name() == node.event.name
|
||||
)
|
||||
return next(iterator, None)
|
||||
|
||||
def _cumulative_cuda_time(self, node: _ModuleTreeNode):
|
||||
"Return cuda time in microseconds"
|
||||
|
||||
def _cumulative_cuda_time_recursive(node: _ModuleTreeNode):
|
||||
if node.is_leaf and (gpu_kineto_event := self._get_kineto_gpu_event(node)):
|
||||
return gpu_kineto_event.duration_ns() / 1000.0
|
||||
else:
|
||||
cumulative_cuda_time = 0
|
||||
for child in node.children:
|
||||
cumulative_cuda_time += _cumulative_cuda_time_recursive(child)
|
||||
return cumulative_cuda_time
|
||||
|
||||
return _cumulative_cuda_time_recursive(node)
|
||||
|
||||
def _total_cuda_time(self):
|
||||
return sum([self._cumulative_cuda_time(root) for root in self._module_tree])
|
||||
|
||||
def _build_stats_trees(self):
|
||||
summary_dict: dict[str, _StatsTreeNode] = {}
|
||||
total_cuda_time = self._total_cuda_time()
|
||||
|
||||
def pct_cuda_time(cuda_time_us):
|
||||
return (cuda_time_us / total_cuda_time) * 100
|
||||
|
||||
def build_summary_stats_tree_df(
|
||||
node: _ModuleTreeNode,
|
||||
parent: _StatsTreeNode | None = None,
|
||||
summary_trace: tuple[str] = (),
|
||||
):
|
||||
if event_has_module(node.event):
|
||||
name = event_module_repr(node.event)
|
||||
cuda_time_us = self._cumulative_cuda_time(node)
|
||||
elif gpu_kineto_event := self._get_kineto_gpu_event(node):
|
||||
name = gpu_kineto_event.name()
|
||||
cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
|
||||
else:
|
||||
return None
|
||||
|
||||
summary_trace = summary_trace + (name,)
|
||||
if summary_trace in summary_dict:
|
||||
entry = summary_dict[summary_trace].entry
|
||||
entry.cuda_time_us += cuda_time_us
|
||||
entry.invocations += 1
|
||||
entry.pct_cuda_time = pct_cuda_time(entry.cuda_time_us)
|
||||
else:
|
||||
new_node = _StatsTreeNode(
|
||||
entry=SummaryStatsEntry(
|
||||
name=name,
|
||||
cuda_time_us=cuda_time_us,
|
||||
pct_cuda_time=pct_cuda_time(cuda_time_us),
|
||||
invocations=1,
|
||||
),
|
||||
children=[],
|
||||
parent=parent,
|
||||
)
|
||||
if parent:
|
||||
parent.children.append(new_node)
|
||||
summary_dict[summary_trace] = new_node
|
||||
|
||||
for child in node.children:
|
||||
build_summary_stats_tree_df(
|
||||
child, summary_dict[summary_trace], summary_trace
|
||||
)
|
||||
|
||||
return summary_dict[summary_trace]
|
||||
|
||||
self._summary_stats_tree = []
|
||||
for root in self._module_tree:
|
||||
self._summary_stats_tree.append(build_summary_stats_tree_df(root))
|
||||
|
||||
def build_model_stats_tree_df(
|
||||
node: _ModuleTreeNode, parent: _StatsTreeNode | None = None
|
||||
):
|
||||
if event_has_module(
|
||||
node.event,
|
||||
):
|
||||
name = event_module_repr(node.event)
|
||||
cuda_time_us = self._cumulative_cuda_time(node)
|
||||
cpu_time_us = node.event.duration_time_ns / 1000
|
||||
trace = ""
|
||||
elif gpu_kineto_event := self._get_kineto_gpu_event(node):
|
||||
name = gpu_kineto_event.name()
|
||||
cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
|
||||
cpu_time_us = 0
|
||||
trace = node.trace
|
||||
else:
|
||||
return None
|
||||
|
||||
new_node = _StatsTreeNode(
|
||||
entry=ModelStatsEntry(
|
||||
name=name,
|
||||
cpu_time_us=cpu_time_us,
|
||||
cuda_time_us=cuda_time_us,
|
||||
pct_cuda_time=pct_cuda_time(cuda_time_us),
|
||||
trace=trace,
|
||||
),
|
||||
parent=parent,
|
||||
children=[],
|
||||
)
|
||||
if parent:
|
||||
parent.children.append(new_node)
|
||||
|
||||
for child in node.children:
|
||||
build_model_stats_tree_df(child, new_node)
|
||||
|
||||
return new_node
|
||||
|
||||
self._model_stats_tree = []
|
||||
for root in self._module_tree:
|
||||
self._model_stats_tree.append(build_model_stats_tree_df(root))
|
||||
|
||||
def _flatten_stats_tree(
|
||||
self, tree: list[_StatsTreeNode]
|
||||
) -> list[tuple[int, StatsEntry]]:
|
||||
entries: list[tuple[int, StatsEntry]] = []
|
||||
|
||||
def df_traversal(node: _StatsTreeNode, depth=0):
|
||||
entries.append((depth, node.entry))
|
||||
for child in node.children:
|
||||
df_traversal(child, depth=depth + 1)
|
||||
|
||||
for root in tree:
|
||||
df_traversal(root)
|
||||
|
||||
return entries
|
||||
|
||||
def _convert_stats_tree_to_dict(self, tree: list[_StatsTreeNode]) -> list[dict]:
|
||||
root_dicts: list[dict] = []
|
||||
|
||||
def df_traversal(node: _StatsTreeNode, curr_json_list: list[dict]):
|
||||
curr_json_list.append({"entry": asdict(node.entry), "children": []})
|
||||
for child in node.children:
|
||||
df_traversal(child, curr_json_list[-1]["children"])
|
||||
|
||||
for root in tree:
|
||||
df_traversal(root, root_dicts)
|
||||
|
||||
return root_dicts
|
||||
|
||||
|
||||
class layerwise_profile(profile):
|
||||
def __init__(self, num_running_seqs: int | None = None):
|
||||
"""
|
||||
layerwise profile constructor.
|
||||
|
||||
Args:
|
||||
num_running_seqs (Optional[int], optional): When given,
|
||||
num_running_seqs will be passed to LayerProfileResults
|
||||
for metadata update. Defaults to None.
|
||||
"""
|
||||
super().__init__(
|
||||
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
|
||||
record_shapes=True,
|
||||
with_stack=True,
|
||||
with_modules=True,
|
||||
experimental_config=_ExperimentalConfig(verbose=True),
|
||||
)
|
||||
|
||||
self.num_running_seqs = num_running_seqs
|
||||
|
||||
def __enter__(self):
|
||||
return super().__enter__()
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
super().__exit__(exc_type, exc_val, exc_tb)
|
||||
self.results = LayerwiseProfileResults(
|
||||
self.profiler.kineto_results, num_running_seqs=self.num_running_seqs
|
||||
)
|
||||
151
vllm/profiler/utils.py
Normal file
151
vllm/profiler/utils.py
Normal file
@@ -0,0 +1,151 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import dataclasses
|
||||
from collections.abc import Callable
|
||||
|
||||
from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata
|
||||
|
||||
#
|
||||
# String / Print Manipulation
|
||||
#
|
||||
|
||||
|
||||
def trim_string_front(string, width):
|
||||
if len(string) > width:
|
||||
offset = len(string) - width + 3
|
||||
string = string[offset:]
|
||||
if len(string) > 3:
|
||||
string = "..." + string[3:]
|
||||
return string
|
||||
|
||||
|
||||
def trim_string_back(string, width):
|
||||
if len(string) > width:
|
||||
offset = len(string) - width + 3
|
||||
string = string[:-offset]
|
||||
if len(string) > 3:
|
||||
string = string + "..."
|
||||
return string
|
||||
|
||||
|
||||
class TablePrinter:
|
||||
def __init__(
|
||||
self, row_cls: type[dataclasses.dataclass], column_widths: dict[str, int]
|
||||
):
|
||||
self.row_cls = row_cls
|
||||
self.fieldnames = [x.name for x in dataclasses.fields(row_cls)]
|
||||
self.column_widths = column_widths
|
||||
assert set(self.column_widths.keys()) == set(self.fieldnames)
|
||||
|
||||
def print_table(self, rows: list[dataclasses.dataclass]):
|
||||
self._print_header()
|
||||
self._print_line()
|
||||
for row in rows:
|
||||
self._print_row(row)
|
||||
|
||||
def _print_header(self):
|
||||
for i, f in enumerate(self.fieldnames):
|
||||
last = i == len(self.fieldnames) - 1
|
||||
col_width = self.column_widths[f]
|
||||
print(
|
||||
trim_string_back(f, col_width).ljust(col_width),
|
||||
end=" | " if not last else "\n",
|
||||
)
|
||||
|
||||
def _print_row(self, row):
|
||||
assert isinstance(row, self.row_cls)
|
||||
|
||||
for i, f in enumerate(self.fieldnames):
|
||||
last = i == len(self.fieldnames) - 1
|
||||
col_width = self.column_widths[f]
|
||||
val = getattr(row, f)
|
||||
|
||||
val_str = ""
|
||||
if isinstance(val, str):
|
||||
val_str = trim_string_back(val, col_width).ljust(col_width)
|
||||
elif type(val) in [float, int]:
|
||||
val_str = f"{float(val):>.2f}".rjust(col_width)
|
||||
else:
|
||||
val_str = f"{val}".rjust(col_width)
|
||||
print(val_str, end=" | " if not last else "\n")
|
||||
|
||||
def _print_line(self):
|
||||
total_col_width = 0
|
||||
for column_width in self.column_widths.values():
|
||||
total_col_width += column_width
|
||||
print("=" * (total_col_width + 3 * (len(self.column_widths) - 1)))
|
||||
|
||||
|
||||
def indent_string(
|
||||
string: str, indent: int, indent_style: Callable[[int], str] | str = " "
|
||||
) -> str:
|
||||
if indent:
|
||||
if isinstance(indent_style, str):
|
||||
return indent_style * indent + string
|
||||
else:
|
||||
return indent_style(indent) + string
|
||||
else:
|
||||
return string
|
||||
|
||||
|
||||
#
|
||||
# _ProfilerEvent utils
|
||||
#
|
||||
|
||||
|
||||
def event_has_module(event: _ProfilerEvent) -> bool:
|
||||
event_type, typed_event = event.typed
|
||||
if event_type == _EventType.PyCall:
|
||||
return typed_event.module is not None
|
||||
return False
|
||||
|
||||
|
||||
def event_is_torch_op(event: _ProfilerEvent) -> bool:
|
||||
return event.tag == _EventType.TorchOp
|
||||
|
||||
|
||||
def event_arg_repr(arg) -> str:
|
||||
if arg is None or type(arg) in [float, int, bool, str]:
|
||||
return f"{arg}"
|
||||
elif isinstance(arg, list):
|
||||
return f"[{', '.join([event_arg_repr(x) for x in arg])}]"
|
||||
elif isinstance(arg, tuple):
|
||||
return f"({', '.join([event_arg_repr(x) for x in arg])})"
|
||||
else:
|
||||
assert isinstance(arg, _TensorMetadata), f"Unsupported type: {type(arg)}"
|
||||
sizes_str = ", ".join([str(x) for x in arg.sizes])
|
||||
return f"{str(arg.dtype).replace('torch.', '')}[{sizes_str}]"
|
||||
|
||||
|
||||
def event_torch_op_repr(event: _ProfilerEvent) -> str:
|
||||
assert event.tag == _EventType.TorchOp
|
||||
args_str = ", ".join([event_arg_repr(x) for x in event.typed[1].inputs])
|
||||
return f"{event.name}({args_str})".replace("aten::", "")
|
||||
|
||||
|
||||
def event_module_repr(event: _ProfilerEvent) -> str:
|
||||
assert event_has_module(event)
|
||||
module = event.typed[1].module
|
||||
if module.parameters and len(module.parameters) > 0:
|
||||
args_str = ", ".join(
|
||||
[f"{x[0]}={event_arg_repr(x[1])}" for x in module.parameters]
|
||||
)
|
||||
return f"{module.cls_name}({args_str})"
|
||||
else:
|
||||
return module.cls_name
|
||||
|
||||
|
||||
def event_torch_op_stack_trace(
|
||||
curr_event: _ProfilerEvent, until: Callable[[_ProfilerEvent], bool]
|
||||
) -> str:
|
||||
trace = ""
|
||||
curr_event = curr_event.parent
|
||||
while curr_event and not until(curr_event):
|
||||
if event_is_torch_op(curr_event):
|
||||
if len(trace) > 0:
|
||||
trace += " <- "
|
||||
trace += event_torch_op_repr(curr_event)
|
||||
curr_event = curr_event.parent
|
||||
|
||||
return trace
|
||||
241
vllm/profiler/wrapper.py
Normal file
241
vllm/profiler/wrapper.py
Normal file
@@ -0,0 +1,241 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from contextlib import nullcontext
|
||||
from typing import Literal
|
||||
|
||||
import torch
|
||||
from typing_extensions import override
|
||||
|
||||
from vllm.config import ProfilerConfig
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class WorkerProfiler(ABC):
|
||||
def __init__(self, profiler_config: ProfilerConfig) -> None:
|
||||
self._delay_iters = profiler_config.delay_iterations
|
||||
if self._delay_iters > 0:
|
||||
logger.info_once(
|
||||
"GPU profiling will start "
|
||||
f"{self._delay_iters} steps after start_profile."
|
||||
)
|
||||
|
||||
self._max_iters = profiler_config.max_iterations
|
||||
if self._max_iters > 0:
|
||||
logger.info_once(
|
||||
"GPU profiling will stop "
|
||||
f"after {self._max_iters} worker steps, "
|
||||
"or when stop_profile is received."
|
||||
)
|
||||
|
||||
# Track when the profiler gets triggered by start_profile
|
||||
self._active_iteration_count = 0
|
||||
self._active = False
|
||||
|
||||
# Track when the profiler is actually running
|
||||
self._profiling_for_iters = 0
|
||||
self._running = False
|
||||
|
||||
@abstractmethod
|
||||
def _start(self) -> None:
|
||||
"""Start the profiler."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def _stop(self) -> None:
|
||||
"""Stop the profiler."""
|
||||
pass
|
||||
|
||||
def _call_start(self) -> None:
|
||||
"""Call _start with error handling but no safeguards."""
|
||||
try:
|
||||
self._start()
|
||||
self._running = True # Only mark as running if start succeeds
|
||||
except Exception as e:
|
||||
logger.warning("Failed to start profiler: %s", e)
|
||||
|
||||
def _call_stop(self) -> None:
|
||||
"""Call _stop with error handling but no safeguards."""
|
||||
try:
|
||||
self._stop()
|
||||
logger.info_once("Profiler stopped successfully.", scope="local")
|
||||
except Exception as e:
|
||||
logger.warning("Failed to stop profiler: %s", e)
|
||||
self._running = False # Always mark as not running, assume stop worked
|
||||
|
||||
def start(self) -> None:
|
||||
"""Attempt to start the profiler, accounting for delayed starts."""
|
||||
if self._active:
|
||||
logger.debug(
|
||||
"start_profile received when profiler is already active. "
|
||||
"Ignoring request."
|
||||
)
|
||||
return
|
||||
self._active = True
|
||||
if self._delay_iters == 0:
|
||||
self._call_start()
|
||||
|
||||
def step(self) -> None:
|
||||
"""Update the profiler state at each worker step,
|
||||
to handle delayed starts and max iteration limits."""
|
||||
if not self._active:
|
||||
return
|
||||
|
||||
self._active_iteration_count += 1
|
||||
|
||||
if (
|
||||
not self._running
|
||||
and self._delay_iters > 0
|
||||
and self._active_iteration_count == self._delay_iters
|
||||
):
|
||||
logger.info_once("Starting profiler after delay...", scope="local")
|
||||
self._call_start()
|
||||
|
||||
if self._running:
|
||||
self._profiling_for_iters += 1
|
||||
|
||||
if (
|
||||
self._max_iters > 0
|
||||
and self._running
|
||||
and self._profiling_for_iters > self._max_iters
|
||||
):
|
||||
# Automatically stop the profiler after max iters
|
||||
# will be marked as not running, but leave as active so that stop
|
||||
# can clean up properly
|
||||
logger.info_once(
|
||||
"Max profiling iterations reached. Stopping profiler...", scope="local"
|
||||
)
|
||||
self._call_stop()
|
||||
return
|
||||
|
||||
def stop(self) -> None:
|
||||
"""Attempt to stop the profiler, accounting for overlapped calls."""
|
||||
if not self._active:
|
||||
logger.debug(
|
||||
"stop_profile received when profiler is not active. Ignoring request."
|
||||
)
|
||||
return
|
||||
self._active = False
|
||||
self._active_iteration_count = 0
|
||||
self._profiling_for_iters = 0
|
||||
|
||||
if self._running:
|
||||
self._call_stop()
|
||||
|
||||
def shutdown(self) -> None:
|
||||
"""Ensure profiler is stopped when shutting down."""
|
||||
logger.info_once("Shutting down profiler", scope="local")
|
||||
if self._running:
|
||||
self.stop()
|
||||
|
||||
def annotate_context_manager(self, name: str):
|
||||
"""Return a context manager to annotate profiler traces."""
|
||||
return nullcontext()
|
||||
|
||||
|
||||
TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"]
|
||||
TorchProfilerActivityMap = {
|
||||
"CPU": torch.profiler.ProfilerActivity.CPU,
|
||||
"CUDA": torch.profiler.ProfilerActivity.CUDA,
|
||||
"XPU": torch.profiler.ProfilerActivity.XPU,
|
||||
}
|
||||
|
||||
|
||||
class TorchProfilerWrapper(WorkerProfiler):
|
||||
def __init__(
|
||||
self,
|
||||
profiler_config: ProfilerConfig,
|
||||
worker_name: str,
|
||||
local_rank: int,
|
||||
activities: list[TorchProfilerActivity],
|
||||
) -> None:
|
||||
super().__init__(profiler_config)
|
||||
|
||||
self.local_rank = local_rank
|
||||
self.profiler_config = profiler_config
|
||||
torch_profiler_trace_dir = profiler_config.torch_profiler_dir
|
||||
if local_rank in (None, 0):
|
||||
logger.info_once(
|
||||
"Torch profiling enabled. Traces will be saved to: %s",
|
||||
torch_profiler_trace_dir,
|
||||
scope="local",
|
||||
)
|
||||
logger.debug(
|
||||
"Profiler config: record_shapes=%s,"
|
||||
"profile_memory=%s,with_stack=%s,with_flops=%s",
|
||||
profiler_config.torch_profiler_record_shapes,
|
||||
profiler_config.torch_profiler_with_memory,
|
||||
profiler_config.torch_profiler_with_stack,
|
||||
profiler_config.torch_profiler_with_flops,
|
||||
)
|
||||
|
||||
self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1
|
||||
self.profiler = torch.profiler.profile(
|
||||
activities=[TorchProfilerActivityMap[activity] for activity in activities],
|
||||
record_shapes=profiler_config.torch_profiler_record_shapes,
|
||||
profile_memory=profiler_config.torch_profiler_with_memory,
|
||||
with_stack=profiler_config.torch_profiler_with_stack,
|
||||
with_flops=profiler_config.torch_profiler_with_flops,
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
torch_profiler_trace_dir,
|
||||
worker_name=worker_name,
|
||||
use_gzip=profiler_config.torch_profiler_use_gzip,
|
||||
),
|
||||
)
|
||||
|
||||
@override
|
||||
def _start(self) -> None:
|
||||
self.profiler.start()
|
||||
|
||||
@override
|
||||
def _stop(self) -> None:
|
||||
self.profiler.stop()
|
||||
|
||||
profiler_config = self.profiler_config
|
||||
rank = self.local_rank
|
||||
if profiler_config.torch_profiler_dump_cuda_time_total:
|
||||
profiler_dir = profiler_config.torch_profiler_dir
|
||||
profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
|
||||
sort_key = "self_cuda_time_total"
|
||||
table = self.profiler.key_averages().table(sort_by=sort_key)
|
||||
|
||||
with open(profiler_out_file, "w") as f:
|
||||
print(table, file=f)
|
||||
|
||||
# only print profiler results on rank 0
|
||||
if rank == 0:
|
||||
print(table)
|
||||
if self.dump_cpu_time_total and rank == 0:
|
||||
logger.info(
|
||||
self.profiler.key_averages().table(
|
||||
sort_by="self_cpu_time_total", row_limit=50
|
||||
)
|
||||
)
|
||||
|
||||
@override
|
||||
def annotate_context_manager(self, name: str):
|
||||
return torch.profiler.record_function(name)
|
||||
|
||||
|
||||
class CudaProfilerWrapper(WorkerProfiler):
|
||||
def __init__(self, profiler_config: ProfilerConfig) -> None:
|
||||
super().__init__(profiler_config)
|
||||
# Note: lazy import to avoid dependency issues if CUDA is not available.
|
||||
import torch.cuda.profiler as cuda_profiler
|
||||
|
||||
self._cuda_profiler = cuda_profiler
|
||||
|
||||
@override
|
||||
def _start(self) -> None:
|
||||
self._cuda_profiler.start()
|
||||
|
||||
@override
|
||||
def _stop(self) -> None:
|
||||
self._cuda_profiler.stop()
|
||||
|
||||
@override
|
||||
def annotate_context_manager(self, name: str):
|
||||
return torch.cuda.nvtx.range(name)
|
||||
Reference in New Issue
Block a user