Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions


@@ -1,24 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utils for model executor."""
-import random
-from typing import Any, Dict, Optional
-import numpy as np
+import copy
+from typing import Any
import torch
+from vllm.utils.torch_utils import is_torch_equal_or_newer
-def set_random_seed(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-    elif torch.musa.is_available():
-        torch.musa.manual_seed_all(seed)
+def set_random_seed(seed: int | None) -> None:
+    from vllm.platforms import current_platform
+
+    current_platform.seed_everything(seed)
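
The new version defers all seeding to the platform layer. As a rough sketch of what `seed_everything` amounts to on the common path (the `seed_everything_sketch` name is illustrative; the real implementation lives in `vllm.platforms` and may do more per device type):

import random

import numpy as np
import torch

def seed_everything_sketch(seed: int | None) -> None:
    # seeding is skipped entirely when no seed is given
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)  # on CUDA builds this also seeds every visible GPU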
def set_weight_attrs(
    weight: torch.Tensor,
-    weight_attrs: Optional[Dict[str, Any]],
+    weight_attrs: dict[str, Any] | None,
):
    """Set attributes on a weight tensor.
@@ -32,6 +32,88 @@ def set_weight_attrs(
    if weight_attrs is None:
        return
    for key, value in weight_attrs.items():
-        assert not hasattr(
-            weight, key), (f"Overwriting existing tensor attribute: {key}")
+        assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
+        # NOTE(woosuk): During weight loading, we often do something like:
+        # narrowed_tensor = param.data.narrow(0, offset, len)
+        # narrowed_tensor.copy_(real_weight)
+        # expecting narrowed_tensor and param.data to share the same storage.
+        # However, on TPUs, narrowed_tensor will lazily propagate to the base
+        # tensor, which is param.data, leading to the redundant memory usage.
+        # This sometimes causes OOM errors during model loading. To avoid this,
+        # we sync the param tensor after its weight loader is called.
+        # TODO(woosuk): Remove this hack once we have a better solution.
+        from vllm.platforms import current_platform
+
+        if current_platform.use_sync_weight_loader() and key == "weight_loader":
+            value = current_platform.make_synced_weight_loader(value)
        setattr(weight, key, value)
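
For context, a minimal usage sketch of `set_weight_attrs` with a hand-rolled loader callback (the `_copy_loader` helper is hypothetical, not part of this file):

import torch

def _copy_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    # simplest possible weight loader: copy the checkpoint tensor in place
    param.data.copy_(loaded_weight)

w = torch.nn.Parameter(torch.empty(16, 16), requires_grad=False)
set_weight_attrs(w, {"weight_loader": _copy_loader})
w.weight_loader(w, torch.randn(16, 16))  # invoked later, during checkpoint load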
+def replace_parameter(layer: torch.nn.Module, param_name: str, new_data: torch.Tensor):
+    """
+    Replace a parameter of a layer while maintaining the ability to reload
+    the weight. Called within implementations of the
+    `process_weights_after_loading` method.
+
+    This function should not be called on weights which are tied/shared.
+
+    Args:
+        layer: Layer containing parameter to replace
+        param_name: Name of parameter to replace
+        new_data: New data of the new parameter
+    """
+    # should not be used on a tied/shared param
+    if isinstance(new_data, torch.nn.Parameter):
+        new_data = new_data.data
+    new_param = torch.nn.Parameter(new_data, requires_grad=False)
+    old_param: torch.nn.Parameter | None = getattr(layer, param_name, None)
+    if old_param is not None and hasattr(old_param, "weight_loader"):
+        weight_loader = old_param.weight_loader
+        set_weight_attrs(new_param, {"weight_loader": weight_loader})
+    setattr(layer, param_name, new_param)
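
A sketch of the intended call site: a quantization method's `process_weights_after_loading` swapping in repacked data. The `ExampleQuantMethod` class and the transpose are illustrative only; the point is that the old parameter's `weight_loader` is carried over to the replacement:

import torch

class ExampleQuantMethod:
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # repack the loaded weight into a kernel-friendly layout, then swap
        # it in; the original weight_loader survives, so the layer can still
        # be reloaded later
        repacked = layer.weight.data.t().contiguous()
        replace_parameter(layer, "weight", repacked)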
+def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
+    parent_map = getattr(model, "packed_modules_mapping", None)
+    parent_map = copy.deepcopy(parent_map) if parent_map is not None else {}
+
+    # don't infer mapping if the model has defined it explicitly.
+    if parent_map:
+        return parent_map
+
+    # We only check main components instead of whole model submodules
+    for child in model.children():
+        child_map = getattr(child, "packed_modules_mapping", None)
+        child_map = copy.deepcopy(child_map) if child_map is not None else {}
+        if any((k in parent_map and parent_map[k] != v) for k, v in child_map.items()):
+            raise ValueError(
+                f"Can't update {type(model).__name__}'s packed_modules_mapping "
+                f"safely because of conflicts from {type(child).__name__}."
+            )
+        else:
+            parent_map.update(child_map)
+    return parent_map
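
For reference, the shape of a typical `packed_modules_mapping`, as declared by LLaMA-style models that fuse the attention and MLP projections:

packed_modules_mapping = {
    # fused parameter name -> original checkpoint shard names
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}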
+def get_moe_expert_mapping(
+    model: torch.nn.Module,
+) -> list[tuple[str, str, int, str]]:
+    if parent_map := getattr(model, "get_expert_mapping", None):
+        return parent_map()
+    else:
+        # We only check main components instead of whole model submodules
+        for child in model.children():
+            child_map = getattr(child, "get_expert_mapping", None)
+            if child_map is not None:
+                return child_map()
+        return []
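
A toy sketch of how a model satisfies this lookup. The entries below are made up, but the layout follows the (param_name, weight_name, expert_id, shard_id) convention of `FusedMoE.make_expert_params_mapping`:

import torch

class ToyMoEModel(torch.nn.Module):
    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
        # made-up entries for a single expert with fused w1/w3 projections
        return [
            ("experts.w13_weight", "experts.0.gate_proj.weight", 0, "w1"),
            ("experts.w13_weight", "experts.0.up_proj.weight", 0, "w3"),
            ("experts.w2_weight", "experts.0.down_proj.weight", 0, "w2"),
        ]

assert get_moe_expert_mapping(ToyMoEModel())  # resolved from the model itself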
+def maybe_disable_graph_partition(current_backend: str) -> dict[str, bool]:
+    if current_backend == "inductor" and is_torch_equal_or_newer("2.9.0.dev"):
+        return {"graph_partition": False}
+    else:
+        return {}
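
A hypothetical call site, merging the override into a dict of inductor options before compilation (the surrounding `compile_options` dict is illustrative):

compile_options: dict[str, bool] = {"max_autotune": False}
compile_options.update(maybe_disable_graph_partition("inductor"))
# on torch >= 2.9.0.dev with inductor, graph_partition is forced off;
# on older torch or other backends the options are left untouched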