Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions


@@ -1,24 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utils for model executor."""
-import random
-from typing import Any, Dict, Optional
-import numpy as np
+import copy
+from typing import Any
import torch
+from vllm.utils.torch_utils import is_torch_equal_or_newer
-def set_random_seed(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-    elif torch.musa.is_available():
-        torch.musa.manual_seed_all(seed)
+def set_random_seed(seed: int | None) -> None:
+    from vllm.platforms import current_platform
+
+    current_platform.seed_everything(seed)
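
The new version defers all seeding to the platform layer. As a rough sketch of what `seed_everything` amounts to on the common path (the `seed_everything_sketch` name is illustrative; the real implementation lives in `vllm.platforms` and may do more per device type):

import random

import numpy as np
import torch

def seed_everything_sketch(seed: int | None) -> None:
    # seeding is skipped entirely when no seed is given
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)  # on CUDA builds this also seeds every visible GPU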
def set_weight_attrs(
    weight: torch.Tensor,
-    weight_attrs: Optional[Dict[str, Any]],
+    weight_attrs: dict[str, Any] | None,
):
    """Set attributes on a weight tensor.
@@ -32,6 +32,88 @@ def set_weight_attrs(
    if weight_attrs is None:
        return
    for key, value in weight_attrs.items():
-        assert not hasattr(
-            weight, key), (f"Overwriting existing tensor attribute: {key}")
+        assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
+        # NOTE(woosuk): During weight loading, we often do something like:
+        # narrowed_tensor = param.data.narrow(0, offset, len)
+        # narrowed_tensor.copy_(real_weight)
+        # expecting narrowed_tensor and param.data to share the same storage.
+        # However, on TPUs, narrowed_tensor will lazily propagate to the base
+        # tensor, which is param.data, leading to the redundant memory usage.
+        # This sometimes causes OOM errors during model loading. To avoid this,
+        # we sync the param tensor after its weight loader is called.
+        # TODO(woosuk): Remove this hack once we have a better solution.
+        from vllm.platforms import current_platform
+
+        if current_platform.use_sync_weight_loader() and key == "weight_loader":
+            value = current_platform.make_synced_weight_loader(value)
        setattr(weight, key, value)
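
For context, a minimal usage sketch of `set_weight_attrs` with a hand-rolled loader callback (the `_copy_loader` helper is hypothetical, not part of this file):

import torch

def _copy_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    # simplest possible weight loader: copy the checkpoint tensor in place
    param.data.copy_(loaded_weight)

w = torch.nn.Parameter(torch.empty(16, 16), requires_grad=False)
set_weight_attrs(w, {"weight_loader": _copy_loader})
w.weight_loader(w, torch.randn(16, 16))  # invoked later, during checkpoint load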
+def replace_parameter(layer: torch.nn.Module, param_name: str, new_data: torch.Tensor):
+    """
+    Replace a parameter of a layer while maintaining the ability to reload
+    the weight. Called within implementations of the
+    `process_weights_after_loading` method.
+
+    This function should not be called on weights which are tied/shared.
+
+    Args:
+        layer: Layer containing parameter to replace
+        param_name: Name of parameter to replace
+        new_data: New data of the new parameter
+    """
+    # should not be used on a tied/shared param
+    if isinstance(new_data, torch.nn.Parameter):
+        new_data = new_data.data
+    new_param = torch.nn.Parameter(new_data, requires_grad=False)
+    old_param: torch.nn.Parameter | None = getattr(layer, param_name, None)
+    if old_param is not None and hasattr(old_param, "weight_loader"):
+        weight_loader = old_param.weight_loader
+        set_weight_attrs(new_param, {"weight_loader": weight_loader})
+    setattr(layer, param_name, new_param)
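
A sketch of the intended call site: a quantization method's `process_weights_after_loading` swapping in repacked data. The `ExampleQuantMethod` class and the transpose are illustrative only; the point is that the old parameter's `weight_loader` is carried over to the replacement:

import torch

class ExampleQuantMethod:
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # repack the loaded weight into a kernel-friendly layout, then swap
        # it in; the original weight_loader survives, so the layer can still
        # be reloaded later
        repacked = layer.weight.data.t().contiguous()
        replace_parameter(layer, "weight", repacked)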
+def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
+    parent_map = getattr(model, "packed_modules_mapping", None)
+    parent_map = copy.deepcopy(parent_map) if parent_map is not None else {}
+
+    # don't infer mapping if the model has defined it explicitly.
+    if parent_map:
+        return parent_map
+
+    # We only check main components instead of whole model submodules
+    for child in model.children():
+        child_map = getattr(child, "packed_modules_mapping", None)
+        child_map = copy.deepcopy(child_map) if child_map is not None else {}
+        if any((k in parent_map and parent_map[k] != v) for k, v in child_map.items()):
+            raise ValueError(
+                f"Can't update {type(model).__name__}'s packed_modules_mapping "
+                f"safely because of conflicts from {type(child).__name__}."
+            )
+        else:
+            parent_map.update(child_map)
+    return parent_map
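
For reference, the shape of a typical `packed_modules_mapping`, as declared by LLaMA-style models that fuse the attention and MLP projections:

packed_modules_mapping = {
    # fused parameter name -> original checkpoint shard names
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}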
+def get_moe_expert_mapping(
+    model: torch.nn.Module,
+) -> list[tuple[str, str, int, str]]:
+    if parent_map := getattr(model, "get_expert_mapping", None):
+        return parent_map()
+    else:
+        # We only check main components instead of whole model submodules
+        for child in model.children():
+            child_map = getattr(child, "get_expert_mapping", None)
+            if child_map is not None:
+                return child_map()
+        return []
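
A toy sketch of how a model satisfies this lookup. The entries below are made up, but the layout follows the (param_name, weight_name, expert_id, shard_id) convention of `FusedMoE.make_expert_params_mapping`:

import torch

class ToyMoEModel(torch.nn.Module):
    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
        # made-up entries for a single expert with fused w1/w3 projections
        return [
            ("experts.w13_weight", "experts.0.gate_proj.weight", 0, "w1"),
            ("experts.w13_weight", "experts.0.up_proj.weight", 0, "w3"),
            ("experts.w2_weight", "experts.0.down_proj.weight", 0, "w2"),
        ]

assert get_moe_expert_mapping(ToyMoEModel())  # resolved from the model itself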
+def maybe_disable_graph_partition(current_backend: str) -> dict[str, bool]:
+    if current_backend == "inductor" and is_torch_equal_or_newer("2.9.0.dev"):
+        return {"graph_partition": False}
+    else:
+        return {}
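
A hypothetical call site, merging the override into a dict of inductor options before compilation (the surrounding `compile_options` dict is illustrative):

compile_options: dict[str, bool] = {"max_autotune": False}
compile_options.update(maybe_disable_graph_partition("inductor"))
# on torch >= 2.9.0.dev with inductor, graph_partition is forced off;
# on older torch or other backends the options are left untouched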