This commit is contained in:
root
2026-04-09 11:23:47 +08:00
parent 8082d5f4b2
commit 72387e4fa8
1885 changed files with 611521 additions and 1 deletions

View File

View File

@@ -0,0 +1,400 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from collections import defaultdict
from collections.abc import Callable
from dataclasses import asdict, dataclass, field
from typing import Any, Generic, TypeAlias, TypeVar
from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult
from torch._C._profiler import _EventType, _ExperimentalConfig, _ProfilerEvent
from torch.autograd.profiler import FunctionEvent
from torch.profiler import ProfilerActivity, profile
from vllm.profiler.utils import (
TablePrinter,
event_has_module,
event_is_torch_op,
event_module_repr,
event_torch_op_stack_trace,
indent_string,
)
from vllm.utils.import_utils import PlaceholderModule
try:
import pandas as pd
except ImportError:
pd = PlaceholderModule("pandas")
@dataclass
class _ModuleTreeNode:
event: _ProfilerEvent
parent: "_ModuleTreeNode | None" = None
children: list["_ModuleTreeNode"] = field(default_factory=list)
trace: str = ""
@property
def is_leaf(self):
return self.event.children is None or len(self.event.children) == 0
@property
def is_torch_op(self):
return event_is_torch_op(self.event)
@property
def is_cuda(self):
return (
self.event.tag == _EventType.Kineto
and self.event.typed[1].device_type == DeviceType.CUDA
)
@dataclass
class SummaryStatsEntry:
name: str
cuda_time_us: float
pct_cuda_time: float
invocations: int
@dataclass
class ModelStatsEntry:
name: str
cpu_time_us: float
cuda_time_us: float
pct_cuda_time: float
trace: str
StatsEntry: TypeAlias = ModelStatsEntry | SummaryStatsEntry
StatsEntryT = TypeVar("StatsEntryT", bound=StatsEntry)
@dataclass
class _StatsTreeNode(Generic[StatsEntryT]):
entry: StatsEntryT
children: list["_StatsTreeNode[StatsEntryT]"] = field(default_factory=list)
parent: "_StatsTreeNode[StatsEntryT] | None" = None
@dataclass
class LayerwiseProfileResults(profile):
_kineto_results: _ProfilerResult
_kineto_event_correlation_map: dict[int, list[_KinetoEvent]] = field(init=False)
_event_correlation_map: dict[int, list[FunctionEvent]] = field(init=False)
_module_tree: list[_ModuleTreeNode] = field(init=False)
_model_stats_tree: list[_StatsTreeNode[ModelStatsEntry]] = field(init=False)
_summary_stats_tree: list[_StatsTreeNode[SummaryStatsEntry]] = field(init=False)
# profile metadata
num_running_seqs: int | None = None
def __post_init__(self):
self._build_correlation_map()
self._build_module_tree()
self._build_stats_trees()
def print_model_table(self, column_widths: dict[str, int] | None = None):
_column_widths = dict(
name=60, cpu_time_us=12, cuda_time_us=12, pct_cuda_time=12, trace=60
)
if column_widths:
_column_widths.update(**column_widths)
filtered_model_table = [
(depth, row)
for depth, row in self._flatten_stats_tree(self._model_stats_tree)
if row.cuda_time_us > 0 or row.cpu_time_us > 0
]
TablePrinter(ModelStatsEntry, _column_widths).print_table(
self._indent_row_names_based_on_depth(
filtered_model_table,
indent_style=lambda indent: "|" + "-" * indent + " ",
)
)
def print_summary_table(self, column_widths: dict[str, int] | None = None):
_column_widths = dict(
name=80, cuda_time_us=12, pct_cuda_time=12, invocations=15
)
if column_widths:
_column_widths.update(**column_widths)
filtered_summary_table = [
(depth, row)
for depth, row in self._flatten_stats_tree(self._summary_stats_tree)
if row.cuda_time_us > 0
]
TablePrinter(SummaryStatsEntry, _column_widths).print_table(
self._indent_row_names_based_on_depth(
filtered_summary_table,
indent_style=lambda indent: "|" + "-" * indent + " ",
)
)
def export_model_stats_table_csv(self, filename: str):
df = pd.DataFrame(
[asdict(row) for _, row in self._flatten_stats_tree(self._model_stats_tree)]
)
df.to_csv(filename)
def export_summary_stats_table_csv(self, filename: str):
df = pd.DataFrame(
[
asdict(row)
for _, row in self._flatten_stats_tree(self._summary_stats_tree)
]
)
df.to_csv(filename)
def convert_stats_to_dict(self) -> dict[str, Any]:
return {
"metadata": {"num_running_seqs": self.num_running_seqs},
"summary_stats": self._convert_stats_tree_to_dict(self._summary_stats_tree),
"model_stats": self._convert_stats_tree_to_dict(self._model_stats_tree),
}
@staticmethod
def _indent_row_names_based_on_depth(
depths_rows: list[tuple[int, StatsEntryT]],
indent_style: Callable[[int], str] | str = " ",
):
indented_rows: list[StatsEntryT] = []
for depth, row in depths_rows:
if row.cuda_time_us == 0:
continue
indented_row: StatsEntryT = copy.deepcopy(row)
indented_row.name = indent_string(indented_row.name, depth, indent_style)
indented_rows.append(indented_row)
return indented_rows
def _build_correlation_map(self):
self._kineto_event_correlation_map = defaultdict(list)
for event in self._kineto_results.events():
self._kineto_event_correlation_map[event.correlation_id()].append(event)
def _build_module_tree(self):
self._module_tree = []
event_tree = self._kineto_results.experimental_event_tree()
def _df_traversal(
event: _ProfilerEvent, curr_node: _ModuleTreeNode | None = None
):
# For the tensor parallel case for now only look at task 1
if event.start_tid != 1:
return
if event_has_module(event):
node = _ModuleTreeNode(event=event, parent=curr_node)
if curr_node:
curr_node.children.append(node)
else:
self._module_tree.append(node)
curr_node = node
is_leaf = event.children is None or len(event.children) == 0
if is_leaf and curr_node:
node = _ModuleTreeNode(
event=event,
parent=curr_node,
trace=event_torch_op_stack_trace(
event, until=lambda x: event_has_module(x)
),
)
curr_node.children.append(node)
curr_node = node
for child in event.children:
_df_traversal(child, curr_node)
for root in event_tree:
_df_traversal(root)
def _get_kineto_gpu_event(self, node: _ModuleTreeNode):
if node.event.tag != _EventType.Kineto:
return None
correlated_kineto_events = self._kineto_event_correlation_map.get(
node.event.correlation_id, []
)
iterator = (
x
for x in correlated_kineto_events
if x.device_type() == DeviceType.CUDA and x.name() == node.event.name
)
return next(iterator, None)
def _cumulative_cuda_time(self, node: _ModuleTreeNode):
"Return cuda time in microseconds"
def _cumulative_cuda_time_recursive(node: _ModuleTreeNode):
if node.is_leaf and (gpu_kineto_event := self._get_kineto_gpu_event(node)):
return gpu_kineto_event.duration_ns() / 1000.0
else:
cumulative_cuda_time = 0
for child in node.children:
cumulative_cuda_time += _cumulative_cuda_time_recursive(child)
return cumulative_cuda_time
return _cumulative_cuda_time_recursive(node)
def _total_cuda_time(self):
return sum([self._cumulative_cuda_time(root) for root in self._module_tree])
def _build_stats_trees(self):
summary_dict: dict[tuple[str, ...], _StatsTreeNode[SummaryStatsEntry]] = {}
total_cuda_time = self._total_cuda_time()
def pct_cuda_time(cuda_time_us):
return (cuda_time_us / total_cuda_time) * 100
def build_summary_stats_tree_df(
node: _ModuleTreeNode,
parent: _StatsTreeNode[SummaryStatsEntry] | None = None,
summary_trace: tuple[str, ...] = (),
) -> _StatsTreeNode[SummaryStatsEntry] | None:
if event_has_module(node.event):
name = event_module_repr(node.event)
cuda_time_us = self._cumulative_cuda_time(node)
elif gpu_kineto_event := self._get_kineto_gpu_event(node):
name = gpu_kineto_event.name()
cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
else:
return None
summary_trace = summary_trace + (name,)
if summary_trace in summary_dict:
entry = summary_dict[summary_trace].entry
entry.cuda_time_us += cuda_time_us
entry.invocations += 1
entry.pct_cuda_time = pct_cuda_time(entry.cuda_time_us)
else:
new_node = _StatsTreeNode(
entry=SummaryStatsEntry(
name=name,
cuda_time_us=cuda_time_us,
pct_cuda_time=pct_cuda_time(cuda_time_us),
invocations=1,
),
parent=parent,
)
if parent:
parent.children.append(new_node)
summary_dict[summary_trace] = new_node
for child in node.children:
build_summary_stats_tree_df(
child, summary_dict[summary_trace], summary_trace
)
return summary_dict[summary_trace]
self._summary_stats_tree = []
for root in self._module_tree:
summary_node = build_summary_stats_tree_df(root)
if summary_node is not None:
self._summary_stats_tree.append(summary_node)
def build_model_stats_tree_df(
node: _ModuleTreeNode,
parent: _StatsTreeNode[ModelStatsEntry] | None = None,
) -> _StatsTreeNode[ModelStatsEntry] | None:
if event_has_module(
node.event,
):
name = event_module_repr(node.event)
cuda_time_us = self._cumulative_cuda_time(node)
cpu_time_us = node.event.duration_time_ns / 1000
trace = ""
elif gpu_kineto_event := self._get_kineto_gpu_event(node):
name = gpu_kineto_event.name()
cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0
cpu_time_us = 0
trace = node.trace
else:
return None
new_node = _StatsTreeNode(
entry=ModelStatsEntry(
name=name,
cpu_time_us=cpu_time_us,
cuda_time_us=cuda_time_us,
pct_cuda_time=pct_cuda_time(cuda_time_us),
trace=trace,
),
parent=parent,
)
if parent:
parent.children.append(new_node)
for child in node.children:
build_model_stats_tree_df(child, new_node)
return new_node
self._model_stats_tree = []
for root in self._module_tree:
model_node = build_model_stats_tree_df(root)
if model_node is not None:
self._model_stats_tree.append(model_node)
def _flatten_stats_tree(
self, tree: list[_StatsTreeNode[StatsEntryT]]
) -> list[tuple[int, StatsEntryT]]:
entries: list[tuple[int, StatsEntryT]] = []
def df_traversal(node: _StatsTreeNode[StatsEntryT], depth: int = 0):
entries.append((depth, node.entry))
for child in node.children:
df_traversal(child, depth=depth + 1)
for root in tree:
df_traversal(root)
return entries
def _convert_stats_tree_to_dict(
self, tree: list[_StatsTreeNode[StatsEntryT]]
) -> list[dict[str, Any]]:
root_dicts: list[dict[str, Any]] = []
def df_traversal(
node: _StatsTreeNode[StatsEntryT], curr_json_list: list[dict[str, Any]]
):
curr_json_list.append({"entry": asdict(node.entry), "children": []})
for child in node.children:
df_traversal(child, curr_json_list[-1]["children"])
for root in tree:
df_traversal(root, root_dicts)
return root_dicts
class layerwise_profile(profile):
def __init__(self, num_running_seqs: int | None = None):
"""
layerwise profile constructor.
Args:
num_running_seqs (Optional[int], optional): When given,
num_running_seqs will be passed to LayerProfileResults
for metadata update. Defaults to None.
"""
super().__init__(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
record_shapes=True,
with_stack=True,
with_modules=True,
experimental_config=_ExperimentalConfig(verbose=True),
)
self.num_running_seqs = num_running_seqs
def __enter__(self):
return super().__enter__()
def __exit__(self, exc_type, exc_val, exc_tb):
super().__exit__(exc_type, exc_val, exc_tb)
self.results = LayerwiseProfileResults(
self.profiler.kineto_results, num_running_seqs=self.num_running_seqs
)

150
vllm/profiler/utils.py Normal file
View File

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Callable
from _typeshed import DataclassInstance
from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata
#
# String / Print Manipulation
#
def trim_string_front(string: str, width: int) -> str:
if len(string) > width:
offset = len(string) - width + 3
string = string[offset:]
if len(string) > 3:
string = "..." + string[3:]
return string
def trim_string_back(string: str, width: int) -> str:
if len(string) > width:
offset = len(string) - width + 3
string = string[:-offset]
if len(string) > 3:
string = string + "..."
return string
class TablePrinter:
def __init__(self, row_cls: type[DataclassInstance], column_widths: dict[str, int]):
self.row_cls = row_cls
self.fieldnames = [x.name for x in dataclasses.fields(row_cls)]
self.column_widths = column_widths
assert set(self.column_widths.keys()) == set(self.fieldnames)
def print_table(self, rows: list[DataclassInstance]):
self._print_header()
self._print_line()
for row in rows:
self._print_row(row)
def _print_header(self):
for i, f in enumerate(self.fieldnames):
last = i == len(self.fieldnames) - 1
col_width = self.column_widths[f]
print(
trim_string_back(f, col_width).ljust(col_width),
end=" | " if not last else "\n",
)
def _print_row(self, row):
assert isinstance(row, self.row_cls)
for i, f in enumerate(self.fieldnames):
last = i == len(self.fieldnames) - 1
col_width = self.column_widths[f]
val = getattr(row, f)
val_str = ""
if isinstance(val, str):
val_str = trim_string_back(val, col_width).ljust(col_width)
elif type(val) in [float, int]:
val_str = f"{float(val):>.2f}".rjust(col_width)
else:
val_str = f"{val}".rjust(col_width)
print(val_str, end=" | " if not last else "\n")
def _print_line(self):
total_col_width = 0
for column_width in self.column_widths.values():
total_col_width += column_width
print("=" * (total_col_width + 3 * (len(self.column_widths) - 1)))
def indent_string(
string: str, indent: int, indent_style: Callable[[int], str] | str = " "
) -> str:
if indent:
if isinstance(indent_style, str):
return indent_style * indent + string
else:
return indent_style(indent) + string
else:
return string
#
# _ProfilerEvent utils
#
def event_has_module(event: _ProfilerEvent) -> bool:
event_type, typed_event = event.typed
if event_type == _EventType.PyCall:
return typed_event.module is not None
return False
def event_is_torch_op(event: _ProfilerEvent) -> bool:
return event.tag == _EventType.TorchOp
def event_arg_repr(arg) -> str:
if arg is None or type(arg) in [float, int, bool, str]:
return f"{arg}"
elif isinstance(arg, list):
return f"[{', '.join([event_arg_repr(x) for x in arg])}]"
elif isinstance(arg, tuple):
return f"({', '.join([event_arg_repr(x) for x in arg])})"
else:
assert isinstance(arg, _TensorMetadata), f"Unsupported type: {type(arg)}"
sizes_str = ", ".join([str(x) for x in arg.sizes])
return f"{str(arg.dtype).replace('torch.', '')}[{sizes_str}]"
def event_torch_op_repr(event: _ProfilerEvent) -> str:
assert event.tag == _EventType.TorchOp
args_str = ", ".join([event_arg_repr(x) for x in event.typed[1].inputs])
return f"{event.name}({args_str})".replace("aten::", "")
def event_module_repr(event: _ProfilerEvent) -> str:
assert event_has_module(event)
module = event.typed[1].module
if module.parameters and len(module.parameters) > 0:
args_str = ", ".join(
[f"{x[0]}={event_arg_repr(x[1])}" for x in module.parameters]
)
return f"{module.cls_name}({args_str})"
else:
return module.cls_name
def event_torch_op_stack_trace(
curr_event: _ProfilerEvent, until: Callable[[_ProfilerEvent], bool]
) -> str:
trace = ""
curr_event = curr_event.parent
while curr_event and not until(curr_event):
if event_is_torch_op(curr_event):
if len(trace) > 0:
trace += " <- "
trace += event_torch_op_repr(curr_event)
curr_event = curr_event.parent
return trace

254
vllm/profiler/wrapper.py Normal file
View File

@@ -0,0 +1,254 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Callable
from contextlib import nullcontext
from typing import Literal
import torch
from typing_extensions import override
from vllm.config import ProfilerConfig
from vllm.config.profiler import _is_uri_path
from vllm.logger import init_logger
logger = init_logger(__name__)
class WorkerProfiler(ABC):
def __init__(self, profiler_config: ProfilerConfig) -> None:
self._delay_iters = profiler_config.delay_iterations
if self._delay_iters > 0:
logger.info_once(
"GPU profiling will start "
f"{self._delay_iters} steps after start_profile."
)
self._max_iters = profiler_config.max_iterations
if self._max_iters > 0:
logger.info_once(
"GPU profiling will stop "
f"after {self._max_iters} worker steps, "
"or when stop_profile is received."
)
# Track when the profiler gets triggered by start_profile
self._active_iteration_count = 0
self._active = False
# Track when the profiler is actually running
self._profiling_for_iters = 0
self._running = False
@abstractmethod
def _start(self) -> None:
"""Start the profiler."""
pass
@abstractmethod
def _stop(self) -> None:
"""Stop the profiler."""
pass
def _call_start(self) -> None:
"""Call _start with error handling but no safeguards."""
try:
self._start()
self._running = True # Only mark as running if start succeeds
except Exception as e:
logger.warning("Failed to start profiler: %s", e)
def _call_stop(self) -> None:
"""Call _stop with error handling but no safeguards."""
try:
self._stop()
logger.info_once("Profiler stopped successfully.", scope="local")
except Exception as e:
logger.warning("Failed to stop profiler: %s", e)
self._running = False # Always mark as not running, assume stop worked
def start(self) -> None:
"""Attempt to start the profiler, accounting for delayed starts."""
if self._active:
logger.debug(
"start_profile received when profiler is already active. "
"Ignoring request."
)
return
self._active = True
if self._delay_iters == 0:
self._call_start()
def step(self) -> None:
"""Update the profiler state at each worker step,
to handle delayed starts and max iteration limits."""
if not self._active:
return
self._active_iteration_count += 1
if (
not self._running
and self._delay_iters > 0
and self._active_iteration_count == self._delay_iters
):
logger.info_once("Starting profiler after delay...", scope="local")
self._call_start()
if self._running:
self._profiling_for_iters += 1
if (
self._max_iters > 0
and self._running
and self._profiling_for_iters > self._max_iters
):
# Automatically stop the profiler after max iters
# will be marked as not running, but leave as active so that stop
# can clean up properly
logger.info_once(
"Max profiling iterations reached. Stopping profiler...", scope="local"
)
self._call_stop()
return
def stop(self) -> None:
"""Attempt to stop the profiler, accounting for overlapped calls."""
if not self._active:
logger.debug(
"stop_profile received when profiler is not active. Ignoring request."
)
return
self._active = False
self._active_iteration_count = 0
self._profiling_for_iters = 0
if self._running:
self._call_stop()
def shutdown(self) -> None:
"""Ensure profiler is stopped when shutting down."""
logger.info_once("Shutting down profiler", scope="local")
if self._running:
self.stop()
def annotate_context_manager(self, name: str):
"""Return a context manager to annotate profiler traces."""
return nullcontext()
TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"]
TorchProfilerActivityMap = {
"CPU": torch.profiler.ProfilerActivity.CPU,
"CUDA": torch.profiler.ProfilerActivity.CUDA,
"XPU": torch.profiler.ProfilerActivity.XPU,
}
class TorchProfilerWrapper(WorkerProfiler):
def __init__(
self,
profiler_config: ProfilerConfig,
worker_name: str,
local_rank: int,
activities: list[TorchProfilerActivity],
on_trace_ready: Callable[[torch.profiler.profile], None] | None = None,
) -> None:
super().__init__(profiler_config)
self.local_rank = local_rank
self.profiler_config = profiler_config
torch_profiler_trace_dir = profiler_config.torch_profiler_dir
if local_rank in (None, 0):
logger.info_once(
"Torch profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir,
scope="local",
)
logger.debug(
"Profiler config: record_shapes=%s,"
"profile_memory=%s,with_stack=%s,with_flops=%s",
profiler_config.torch_profiler_record_shapes,
profiler_config.torch_profiler_with_memory,
profiler_config.torch_profiler_with_stack,
profiler_config.torch_profiler_with_flops,
)
# Determine trace handler: use custom handler if provided,
# otherwise default to tensorboard trace handler
if on_trace_ready is not None:
trace_handler = on_trace_ready
else:
trace_handler = torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir,
worker_name=worker_name,
use_gzip=profiler_config.torch_profiler_use_gzip,
)
self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1
self.profiler = torch.profiler.profile(
activities=[TorchProfilerActivityMap[activity] for activity in activities],
record_shapes=profiler_config.torch_profiler_record_shapes,
profile_memory=profiler_config.torch_profiler_with_memory,
with_stack=profiler_config.torch_profiler_with_stack,
with_flops=profiler_config.torch_profiler_with_flops,
on_trace_ready=trace_handler,
)
@override
def _start(self) -> None:
self.profiler.start()
@override
def _stop(self) -> None:
self.profiler.stop()
profiler_config = self.profiler_config
rank = self.local_rank
if profiler_config.torch_profiler_dump_cuda_time_total:
profiler_dir = profiler_config.torch_profiler_dir
sort_key = "self_cuda_time_total"
table = self.profiler.key_averages().table(sort_by=sort_key)
# Skip file write for URI paths (gs://, s3://, etc.)
# as standard file I/O doesn't work with URI schemes
if not _is_uri_path(profiler_dir):
profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
with open(profiler_out_file, "w") as f:
print(table, file=f)
# only print profiler results on rank 0
if rank == 0:
print(table)
if self.dump_cpu_time_total and rank == 0:
logger.info(
self.profiler.key_averages().table(
sort_by="self_cpu_time_total", row_limit=50
)
)
@override
def annotate_context_manager(self, name: str):
return torch.profiler.record_function(name)
class CudaProfilerWrapper(WorkerProfiler):
def __init__(self, profiler_config: ProfilerConfig) -> None:
super().__init__(profiler_config)
# Note: lazy import to avoid dependency issues if CUDA is not available.
import torch.cuda.profiler as cuda_profiler
self._cuda_profiler = cuda_profiler
@override
def _start(self) -> None:
self._cuda_profiler.start()
@override
def _stop(self) -> None:
self._cuda_profiler.stop()
@override
def annotate_context_manager(self, name: str):
return torch.cuda.nvtx.range(name)