This commit is contained in:
root
2026-03-05 18:06:10 +08:00
commit 809cecae09
2569 changed files with 478204 additions and 0 deletions

0
profiler/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

37
profiler/gpu_profiler.py Normal file
View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logger import init_logger
logger = init_logger(__name__)
class CudaProfilerWrapper:
def __init__(self) -> None:
self._profiler_running = False
# Note: lazy import to avoid dependency issues if CUDA is not available.
import torch.cuda.profiler as cuda_profiler
self._cuda_profiler = cuda_profiler
def start(self) -> None:
try:
self._cuda_profiler.start()
self._profiler_running = True
logger.info_once("Started CUDA profiler")
except Exception as e:
logger.warning_once("Failed to start CUDA profiler: %s", e)
def stop(self) -> None:
if self._profiler_running:
try:
self._cuda_profiler.stop()
logger.info_once("Stopped CUDA profiler")
except Exception as e:
logger.warning_once("Failed to stop CUDA profiler: %s", e)
finally:
self._profiler_running = False
def shutdown(self) -> None:
"""Ensure profiler is stopped when shutting down."""
self.stop()

View File

@@ -0,0 +1,392 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from collections import defaultdict
from collections.abc import Callable
from dataclasses import asdict, dataclass, field
from typing import Any, Optional, TypeAlias
from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult
from torch._C._profiler import _EventType, _ExperimentalConfig, _ProfilerEvent
from torch.autograd.profiler import FunctionEvent
from torch.profiler import ProfilerActivity, profile
from vllm.profiler.utils import (
TablePrinter,
event_has_module,
event_is_torch_op,
event_module_repr,
event_torch_op_stack_trace,
indent_string,
)
from vllm.utils.import_utils import PlaceholderModule
try:
import pandas as pd
except ImportError:
pd = PlaceholderModule("pandas")
@dataclass
class _ModuleTreeNode:
event: _ProfilerEvent
parent: Optional["_ModuleTreeNode"] = None
children: list["_ModuleTreeNode"] = field(default_factory=list)
trace: str = ""
@property
def is_leaf(self):
return self.event.children is None or len(self.event.children) == 0
@property
def is_torch_op(self):
return event_is_torch_op(self.event)
@property
def is_cuda(self):
return (
self.event.tag == _EventType.Kineto
and self.event.typed[1].device_type == DeviceType.CUDA
)
@dataclass
class SummaryStatsEntry:
name: str
cuda_time_us: float
pct_cuda_time: float
invocations: int
@dataclass
class ModelStatsEntry:
name: str
cpu_time_us: float
cuda_time_us: float
pct_cuda_time: float
trace: str
StatsEntry: TypeAlias = ModelStatsEntry | SummaryStatsEntry
@dataclass
class _StatsTreeNode:
entry: StatsEntry
children: list[StatsEntry]
parent: StatsEntry | None
@dataclass
class LayerwiseProfileResults(profile):
_kineto_results: _ProfilerResult
_kineto_event_correlation_map: dict[int, list[_KinetoEvent]] = field(init=False)
_event_correlation_map: dict[int, list[FunctionEvent]] = field(init=False)
_module_tree: list[_ModuleTreeNode] = field(init=False)
_model_stats_tree: list[_StatsTreeNode] = field(init=False)
_summary_stats_tree: list[_StatsTreeNode] = field(init=False)
# profile metadata
num_running_seqs: int | None = None
def __post_init__(self):
self._build_correlation_map()
self._build_module_tree()
self._build_stats_trees()
def print_model_table(self, column_widths: dict[str, int] = None):
_column_widths = dict(
name=60, cpu_time_us=12, cuda_time_us=12, pct_cuda_time=12, trace=60
)
if column_widths:
_column_widths.update(**column_widths)
filtered_model_table = [
(depth, row)
for depth, row in self._flatten_stats_tree(self._model_stats_tree)
if row.cuda_time_us > 0 or row.cpu_time_us > 0
]
TablePrinter(ModelStatsEntry, _column_widths).print_table(
self._indent_row_names_based_on_depth(
filtered_model_table,
indent_style=lambda indent: "|" + "-" * indent + " ",
)
)
def print_summary_table(self, column_widths: dict[str, int] = None):
_column_widths = dict(
name=80, cuda_time_us=12, pct_cuda_time=12, invocations=15
)
if column_widths:
_column_widths.update(**column_widths)
filtered_summary_table = [
(depth, row)
for depth, row in self._flatten_stats_tree(self._summary_stats_tree)
if row.cuda_time_us > 0
]
TablePrinter(SummaryStatsEntry, _column_widths).print_table(
self._indent_row_names_based_on_depth(
filtered_summary_table,
indent_style=lambda indent: "|" + "-" * indent + " ",
)
)
def export_model_stats_table_csv(self, filename: str):
df = pd.DataFrame(
[asdict(row) for _, row in self._flatten_stats_tree(self._model_stats_tree)]
)
df.to_csv(filename)
def export_summary_stats_table_csv(self, filename: str):
df = pd.DataFrame(
[
asdict(row)
for _, row in self._flatten_stats_tree(self._summary_stats_tree)
]
)
df.to_csv(filename)
def convert_stats_to_dict(self) -> dict[str, Any]:
return {
"metadata": {"num_running_seqs": self.num_running_seqs},
"summary_stats": self._convert_stats_tree_to_dict(self._summary_stats_tree),
"model_stats": self._convert_stats_tree_to_dict(self._model_stats_tree),
}
@staticmethod
def _indent_row_names_based_on_depth(
depths_rows: list[tuple[int, StatsEntry]],
indent_style: Callable[[int], str] | str = " ",
):
indented_rows = []
for depth, row in depths_rows:
if row.cuda_time_us == 0:
continue
indented_row = copy.deepcopy(row)
indented_row.name = indent_string(indented_row.name, depth, indent_style)
indented_rows.append(indented_row)
return indented_rows
def _build_correlation_map(self):
self._kineto_event_correlation_map = defaultdict(list)
for event in self._kineto_results.events():
self._kineto_event_correlation_map[event.correlation_id()].append(event)
def _build_module_tree(self):
self._module_tree = []
event_tree = self._kineto_results.experimental_event_tree()
def _df_traversal(
event: _ProfilerEvent, curr_node: _ModuleTreeNode | None = None
):
# For the tensor parallel case for now only look at task 1
if event.start_tid != 1:
return
if event_has_module(event):
node = _ModuleTreeNode(event=event, parent=curr_node)
if curr_node:
curr_node.children.append(node)
else:
self._module_tree.append(node)
curr_node = node
is_leaf = event.children is None or len(event.children) == 0
if is_leaf and curr_node:
node = _ModuleTreeNode(
event=event,
parent=curr_node,
trace=event_torch_op_stack_trace(
event, until=lambda x: event_has_module(x)
),
)
curr_node.children.append(node)
curr_node = node
for child in event.children:
_df_traversal(child, curr_node)
for root in event_tree:
_df_traversal(root)
def _get_kineto_gpu_event(self, node: _ModuleTreeNode):
if node.event.tag != _EventType.Kineto:
return None
correlated_kineto_events = self._kineto_event_correlation_map.get(
node.event.correlation_id, []
)
iterator = (
x
for x in correlated_kineto_events
if x.device_type() == DeviceType.CUDA and x.name() == node.event.name
)
return next(iterator, None)
def _cumulative_cuda_time(self, node: _ModuleTreeNode):
"Return cuda time in microseconds"
def _cumulative_cuda_time_recursive(node: _ModuleTreeNode):
if node.is_leaf and (gpu_kineto_event := self._get_kineto_gpu_event(node)):
return gpu_kineto_event.duration_ns() / 1000.0
else:
cumulative_cuda_time = 0
for child in node.children:
cumulative_cuda_time += _cumulative_cuda_time_recursive(child)
return cumulative_cuda_time
return _cumulative_cuda_time_recursive(node)
def _total_cuda_time(self):
return sum([self._cumulative_cuda_time(root) for root in self._module_tree])
def _build_stats_trees(self):
summary_dict: dict[str, _StatsTreeNode] = {}
total_cuda_time = self._total_cuda_time()
def pct_cuda_time(cuda_time_us):
return (cuda_time_us / total_cuda_time) * 100
def build_summary_stats_tree_df(
node: _ModuleTreeNode,
parent: _StatsTreeNode | None = None,
summary_trace: tuple[str] = (),
):
if event_has_module(node.event):
name = event_module_repr(node.event)
cuda_time_us = self._cumulative_cuda_time(node)
elif gpu_kineto_event := self._get_kineto_gpu_event(node):
name = gpu_kineto_event.name()
cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
else:
return None
summary_trace = summary_trace + (name,)
if summary_trace in summary_dict:
entry = summary_dict[summary_trace].entry
entry.cuda_time_us += cuda_time_us
entry.invocations += 1
entry.pct_cuda_time = pct_cuda_time(entry.cuda_time_us)
else:
new_node = _StatsTreeNode(
entry=SummaryStatsEntry(
name=name,
cuda_time_us=cuda_time_us,
pct_cuda_time=pct_cuda_time(cuda_time_us),
invocations=1,
),
children=[],
parent=parent,
)
if parent:
parent.children.append(new_node)
summary_dict[summary_trace] = new_node
for child in node.children:
build_summary_stats_tree_df(
child, summary_dict[summary_trace], summary_trace
)
return summary_dict[summary_trace]
self._summary_stats_tree = []
for root in self._module_tree:
self._summary_stats_tree.append(build_summary_stats_tree_df(root))
def build_model_stats_tree_df(
node: _ModuleTreeNode, parent: _StatsTreeNode | None = None
):
if event_has_module(
node.event,
):
name = event_module_repr(node.event)
cuda_time_us = self._cumulative_cuda_time(node)
cpu_time_us = node.event.duration_time_ns / 1000
trace = ""
elif gpu_kineto_event := self._get_kineto_gpu_event(node):
name = gpu_kineto_event.name()
cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
cpu_time_us = 0
trace = node.trace
else:
return None
new_node = _StatsTreeNode(
entry=ModelStatsEntry(
name=name,
cpu_time_us=cpu_time_us,
cuda_time_us=cuda_time_us,
pct_cuda_time=pct_cuda_time(cuda_time_us),
trace=trace,
),
parent=parent,
children=[],
)
if parent:
parent.children.append(new_node)
for child in node.children:
build_model_stats_tree_df(child, new_node)
return new_node
self._model_stats_tree = []
for root in self._module_tree:
self._model_stats_tree.append(build_model_stats_tree_df(root))
def _flatten_stats_tree(
self, tree: list[_StatsTreeNode]
) -> list[tuple[int, StatsEntry]]:
entries: list[tuple[int, StatsEntry]] = []
def df_traversal(node: _StatsTreeNode, depth=0):
entries.append((depth, node.entry))
for child in node.children:
df_traversal(child, depth=depth + 1)
for root in tree:
df_traversal(root)
return entries
def _convert_stats_tree_to_dict(self, tree: list[_StatsTreeNode]) -> list[dict]:
root_dicts: list[dict] = []
def df_traversal(node: _StatsTreeNode, curr_json_list: list[dict]):
curr_json_list.append({"entry": asdict(node.entry), "children": []})
for child in node.children:
df_traversal(child, curr_json_list[-1]["children"])
for root in tree:
df_traversal(root, root_dicts)
return root_dicts
class layerwise_profile(profile):
def __init__(self, num_running_seqs: int | None = None):
"""
layerwise profile constructor.
Args:
num_running_seqs (Optional[int], optional): When given,
num_running_seqs will be passed to LayerProfileResults
for metadata update. Defaults to None.
"""
super().__init__(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
record_shapes=True,
with_stack=True,
with_modules=True,
experimental_config=_ExperimentalConfig(verbose=True),
)
self.num_running_seqs = num_running_seqs
def __enter__(self):
return super().__enter__()
def __exit__(self, exc_type, exc_val, exc_tb):
super().__exit__(exc_type, exc_val, exc_tb)
self.results = LayerwiseProfileResults(
self.profiler.kineto_results, num_running_seqs=self.num_running_seqs
)

151
profiler/utils.py Normal file
View File

@@ -0,0 +1,151 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Callable
from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata
#
# String / Print Manipulation
#
def trim_string_front(string, width):
if len(string) > width:
offset = len(string) - width + 3
string = string[offset:]
if len(string) > 3:
string = "..." + string[3:]
return string
def trim_string_back(string, width):
if len(string) > width:
offset = len(string) - width + 3
string = string[:-offset]
if len(string) > 3:
string = string + "..."
return string
class TablePrinter:
def __init__(
self, row_cls: type[dataclasses.dataclass], column_widths: dict[str, int]
):
self.row_cls = row_cls
self.fieldnames = [x.name for x in dataclasses.fields(row_cls)]
self.column_widths = column_widths
assert set(self.column_widths.keys()) == set(self.fieldnames)
def print_table(self, rows: list[dataclasses.dataclass]):
self._print_header()
self._print_line()
for row in rows:
self._print_row(row)
def _print_header(self):
for i, f in enumerate(self.fieldnames):
last = i == len(self.fieldnames) - 1
col_width = self.column_widths[f]
print(
trim_string_back(f, col_width).ljust(col_width),
end=" | " if not last else "\n",
)
def _print_row(self, row):
assert isinstance(row, self.row_cls)
for i, f in enumerate(self.fieldnames):
last = i == len(self.fieldnames) - 1
col_width = self.column_widths[f]
val = getattr(row, f)
val_str = ""
if isinstance(val, str):
val_str = trim_string_back(val, col_width).ljust(col_width)
elif type(val) in [float, int]:
val_str = f"{float(val):>.2f}".rjust(col_width)
else:
val_str = f"{val}".rjust(col_width)
print(val_str, end=" | " if not last else "\n")
def _print_line(self):
total_col_width = 0
for column_width in self.column_widths.values():
total_col_width += column_width
print("=" * (total_col_width + 3 * (len(self.column_widths) - 1)))
def indent_string(
string: str, indent: int, indent_style: Callable[[int], str] | str = " "
) -> str:
if indent:
if isinstance(indent_style, str):
return indent_style * indent + string
else:
return indent_style(indent) + string
else:
return string
#
# _ProfilerEvent utils
#
def event_has_module(event: _ProfilerEvent) -> bool:
event_type, typed_event = event.typed
if event_type == _EventType.PyCall:
return typed_event.module is not None
return False
def event_is_torch_op(event: _ProfilerEvent) -> bool:
return event.tag == _EventType.TorchOp
def event_arg_repr(arg) -> str:
if arg is None or type(arg) in [float, int, bool, str]:
return f"{arg}"
elif isinstance(arg, list):
return f"[{', '.join([event_arg_repr(x) for x in arg])}]"
elif isinstance(arg, tuple):
return f"({', '.join([event_arg_repr(x) for x in arg])})"
else:
assert isinstance(arg, _TensorMetadata), f"Unsupported type: {type(arg)}"
sizes_str = ", ".join([str(x) for x in arg.sizes])
return f"{str(arg.dtype).replace('torch.', '')}[{sizes_str}]"
def event_torch_op_repr(event: _ProfilerEvent) -> str:
assert event.tag == _EventType.TorchOp
args_str = ", ".join([event_arg_repr(x) for x in event.typed[1].inputs])
return f"{event.name}({args_str})".replace("aten::", "")
def event_module_repr(event: _ProfilerEvent) -> str:
assert event_has_module(event)
module = event.typed[1].module
if module.parameters and len(module.parameters) > 0:
args_str = ", ".join(
[f"{x[0]}={event_arg_repr(x[1])}" for x in module.parameters]
)
return f"{module.cls_name}({args_str})"
else:
return module.cls_name
def event_torch_op_stack_trace(
curr_event: _ProfilerEvent, until: Callable[[_ProfilerEvent], bool]
) -> str:
trace = ""
curr_event = curr_event.parent
while curr_event and not until(curr_event):
if event_is_torch_op(curr_event):
if len(trace) > 0:
trace += " <- "
trace += event_torch_op_repr(curr_event)
curr_event = curr_event.parent
return trace