[CI] Upgrade vllm to newest commit (#3182)

### What this PR does / why we need it? Upgrade vLLM to the newest commit - Fix aclgraph not working, caused by 24fab45d96 - Fix the PoolerOutput import error, caused by 755ed7b05b - Fix the aclgraph weight load error to stay consistent with the torchair fix. 4492e3a554 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? All tests should pass - vLLM version: v0.10.2 - vLLM main: 52d0cb8458 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-26 06:18:15 +08:00
parent 0794f64a18
commit 2930e4a6bd
9 changed files with 49 additions and 53 deletions
--- a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
@@ -1,9 +1,6 @@
 import torch
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
-# yapf: disable
-from vllm.model_executor.parameter import ModelWeightParameter
-# yapf: enable
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import GiB_bytes

@@ -16,27 +13,15 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                   output_partition_sizes: list[int], input_size: int,
                   output_size: int, params_dtype: torch.dtype,
                   **extra_weight_attrs):
-    from vllm_ascend.ascend_config import get_ascend_config
-    ascend_config = get_ascend_config()
    # This method creates unquantized linear weights.
    # The weights are not quantized, and they are not sharded.
    # The amount of memory allocated for the weights is
    # sum(output_partition_sizes) * input_size_per_partition.
    try:
-        if ascend_config.torchair_graph_config.enabled:
-            weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                           input_size_per_partition,
-                                           dtype=params_dtype),
-                               requires_grad=False)
-        else:
-            weight_loader = extra_weight_attrs.pop("weight_loader")
-            weight = ModelWeightParameter(data=torch.empty(
-                sum(output_partition_sizes),
-                input_size_per_partition,
-                dtype=params_dtype),
-                                          input_dim=1,
-                                          output_dim=0,
-                                          weight_loader=weight_loader)
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=params_dtype),
+                           requires_grad=False)
    except torch.cuda.OutOfMemoryError as e:
        logger.error("Failed to create unquantized linear weights: %s", e)
        if torch.cuda.is_available():
@@ -49,8 +34,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
            "Failed to create unquantized linear weights. "
            "This may be caused by insufficient memory to allocate "
            "the weight.") from e
-    if ascend_config.torchair_graph_config.enabled:
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+    set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
    layer.register_parameter("weight", weight)
    set_weight_attrs(weight, extra_weight_attrs)