[CI] Upgrade vLLM version (#3139)

Upgrade vLLM version to the newest commit. - Fix the breaking change introduced by 969b4da3a6 - Add a patch as a quick fix for torchair (de94289a98) - Fix the UT error introduced by de94289a98 Close: https://github.com/vllm-project/vllm-ascend/issues/3138 - vLLM version: v0.10.2 - vLLM main: f225ea7dd9 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: MengqingCao <cmq0113@163.com>
2025-09-25 07:36:51 +08:00
parent 464270e4ca
commit a055183821
9 changed files with 105 additions and 15 deletions
--- a/vllm_ascend/patch/worker/patch_common/__init__.py
+++ b/vllm_ascend/patch/worker/patch_common/__init__.py
@@ -22,6 +22,7 @@ if HAS_TRITON:

 import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
+import vllm_ascend.patch.worker.patch_common.patch_weight_loader  # noqa

 # TODO: revert me when triton import is fixed
 # import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
--- a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
@@ -0,0 +1,60 @@
+import torch
+from torch.nn.parameter import Parameter
+from vllm.logger import init_logger
+# yapf: disable
+from vllm.model_executor.parameter import ModelWeightParameter
+# yapf: enable
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.utils import GiB_bytes
+
+from vllm_ascend.utils import vllm_version_is
+
+logger = init_logger(__name__)
+
+
+def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
+                   output_partition_sizes: list[int], input_size: int,
+                   output_size: int, params_dtype: torch.dtype,
+                   **extra_weight_attrs):
+    from vllm_ascend.ascend_config import get_ascend_config
+    ascend_config = get_ascend_config()
+    # Monkey-patch body for UnquantizedLinearMethod.create_weights: registers
+    # an uninitialized `weight` of shape (sum(output_partition_sizes),
+    # input_size_per_partition) in params_dtype on `layer`. With torchair graph
+    # mode on it is a plain Parameter, otherwise a ModelWeightParameter.
+    try:
+        if ascend_config.torchair_graph_config.enabled:
+            weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                           input_size_per_partition,
+                                           dtype=params_dtype),
+                               requires_grad=False)
+        else:
+            weight_loader = extra_weight_attrs.pop("weight_loader")
+            weight = ModelWeightParameter(data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype),
+                                          input_dim=1,
+                                          output_dim=0,
+                                          weight_loader=weight_loader)
+    except torch.cuda.OutOfMemoryError as e:  # NOTE(review): CUDA type on NPU?
+        logger.error("Failed to create unquantized linear weights: %s", e)
+        if torch.cuda.is_available():
+            logger.debug("CUDA device: %s", torch.cuda.current_device())
+            logger.debug("Allocated: %.2f GiB",
+                         torch.cuda.memory_allocated() / GiB_bytes)
+            logger.debug("Reserved: %.2f GiB",
+                         torch.cuda.memory_reserved() / GiB_bytes)
+        raise RuntimeError(
+            "Failed to create unquantized linear weights. "
+            "This may be caused by insufficient memory to allocate "
+            "the weight.") from e
+    if ascend_config.torchair_graph_config.enabled:
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+    layer.register_parameter("weight", weight)
+    set_weight_attrs(weight, extra_weight_attrs)
+
+
+if not vllm_version_is("0.10.2"):  # leave vLLM 0.10.2 unpatched
+    from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+    UnquantizedLinearMethod.create_weights = create_weights