From a055183821120f66b44b514b1795c51ec142df2e Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Thu, 25 Sep 2025 07:36:51 +0800 Subject: [PATCH] [CI] Upgrade vLLM version (#3139) Upgrade vLLM version to the newest commit. - Fix the breaking change introduced by https://github.com/vllm-project/vllm/commit/969b4da3a6ab737f72cb33db502b4c0bb70d4139 - Add a patch to quick-fix torchair https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170 - fix the ut error introduced by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170 Close: https://github.com/vllm-project/vllm-ascend/issues/3138 - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 --------- Signed-off-by: wangxiyuan Signed-off-by: MengqingCao Co-authored-by: MengqingCao --- .github/workflows/format_pr_body.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 6 +- .github/workflows/vllm_ascend_test_full.yaml | 2 +- tests/ut/models/test_qwen2_5_vl.py | 4 ++ tests/ut/ops/test_linear.py | 4 ++ .../patch/worker/patch_common/__init__.py | 1 + .../patch_common/patch_weight_loader.py | 60 +++++++++++++++++++ vllm_ascend/torchair/torchair_worker.py | 14 +++++ vllm_ascend/worker/model_runner_v1.py | 27 +++++---- 9 files changed, 105 insertions(+), 15 deletions(-) create mode 100644 vllm_ascend/patch/worker/patch_common/patch_weight_loader.py diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index 56d85b0..b053c51 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -36,7 +36,7 @@ jobs: - name: Get vLLM version run: | - VLLM_COMMIT=f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 + VLLM_COMMIT=b1068903fdca26cf6b4a1a51a32c3365ce3ac636 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/vllm_ascend_test.yaml 
b/.github/workflows/vllm_ascend_test.yaml index a26538c..49052b5 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -42,7 +42,7 @@ jobs: lint: uses: ./.github/workflows/pre-commit.yml with: - vllm: f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 + vllm: b1068903fdca26cf6b4a1a51a32c3365ce3ac636 changes: runs-on: ubuntu-latest @@ -83,7 +83,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] + vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2] steps: - name: Install packages run: | @@ -138,7 +138,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] + vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index e5836b5..d5144cc 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -68,7 +68,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] + vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/tests/ut/models/test_qwen2_5_vl.py b/tests/ut/models/test_qwen2_5_vl.py index e9982f9..06fb07d 100644 --- a/tests/ut/models/test_qwen2_5_vl.py +++ b/tests/ut/models/test_qwen2_5_vl.py @@ -298,6 +298,10 @@ class TestAscendQwen2_5_VisionTransformer(PytestBase): "vllm_ascend.ops.linear_op.get_tp_group", return_value=mock_group, ) + mocker.patch( + "vllm.distributed.parallel_state.get_tp_group", + return_value=mock_group, + ) vision_transformer = AscendQwen2_5_VisionTransformer( vision_config, diff --git a/tests/ut/ops/test_linear.py b/tests/ut/ops/test_linear.py index 083416e..e22d7ca 100644 --- a/tests/ut/ops/test_linear.py +++ b/tests/ut/ops/test_linear.py @@ -33,6 +33,10 @@ class BaseLinearTest(unittest.TestCase): return_value=self.mock_group), patch("vllm_ascend.ops.linear_op.get_tp_group", return_value=self.mock_group), + patch( + "vllm.distributed.parallel_state.get_tp_group", + return_value=self.mock_group, + ), patch("vllm_ascend.utils.mlp_tp_enable", return_value=True), patch("vllm_ascend.utils.oproj_tp_enable", return_value=True) ] diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py index 37407b4..baf5321 100644 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -22,6 +22,7 @@ if HAS_TRITON: import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import 
vllm_ascend.patch.worker.patch_common.patch_logits # noqa +import vllm_ascend.patch.worker.patch_common.patch_weight_loader # noqa # TODO: revert me when triton import is fixed # import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py new file mode 100644 index 0000000..4bbd6d3 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py @@ -0,0 +1,60 @@ +import torch +from torch.nn.parameter import Parameter +from vllm.logger import init_logger +# yapf: disable +from vllm.model_executor.parameter import ModelWeightParameter +# yapf: enable +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import GiB_bytes + +from vllm_ascend.utils import vllm_version_is + +logger = init_logger(__name__) + + +def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, + output_partition_sizes: list[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + from vllm_ascend.ascend_config import get_ascend_config + ascend_config = get_ascend_config() + # This method creates unquantized linear weights. + # The weights are not quantized, and they are not sharded. + # The amount of memory allocated for the weights is + # sum(output_partition_sizes) * input_size_per_partition. 
+ try: + if ascend_config.torchair_graph_config.enabled: + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + else: + weight_loader = extra_weight_attrs.pop("weight_loader") + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + except torch.cuda.OutOfMemoryError as e: + logger.error("Failed to create unquantized linear weights: %s", e) + if torch.cuda.is_available(): + logger.debug("CUDA device: %s", torch.cuda.current_device()) + logger.debug("Allocated: %.2f GiB", + torch.cuda.memory_allocated() / GiB_bytes) + logger.debug("Reserved: %.2f GiB", + torch.cuda.memory_reserved() / GiB_bytes) + raise RuntimeError( + "Failed to create unquantized linear weights. " + "This may be caused by insufficient memory to allocate " + "the weight.") from e + if ascend_config.torchair_graph_config.enabled: + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + +if not vllm_version_is("0.10.2"): + from vllm.model_executor.layers.linear import UnquantizedLinearMethod + UnquantizedLinearMethod.create_weights = create_weights diff --git a/vllm_ascend/torchair/torchair_worker.py b/vllm_ascend/torchair/torchair_worker.py index dbee800..ec3f1aa 100644 --- a/vllm_ascend/torchair/torchair_worker.py +++ b/vllm_ascend/torchair/torchair_worker.py @@ -28,6 +28,20 @@ from vllm_ascend.worker.worker_v1 import NPUWorker class NPUTorchairWorker(NPUWorker): """Torchair worker bases on NPUWorker. 
Only torchair specified code should be added in this class.""" + def __init__(self, + vllm_config, + local_rank, + rank, + distributed_init_method, + is_driver_worker=False, + **kwargs): + super().__init__(vllm_config, local_rank, rank, + distributed_init_method, is_driver_worker, **kwargs) + from vllm.model_executor.layers.linear import \ + WEIGHT_LOADER_V2_SUPPORTED + if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED: + WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") + def determine_available_memory(self) -> int: """Override determine_available_memory to use cached torchair kv_cache_bytes.""" diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index f4656dd..670e69a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -304,17 +304,24 @@ class NPUModelRunner(LoRAModelRunnerMixin): (self.max_num_tokens, self.model_config.get_hidden_size()), dtype=self.dtype, device=self.device) - # Set up Attention - self.attn_backend = get_attn_backend( - 0, - self.dtype, - None, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) - + if vllm_version_is("0.10.2"): + self.attn_backend = get_attn_backend( + 0, + self.dtype, + None, + self.block_size, + self.model_config.is_attention_free, + use_mla=self.model_config.use_mla, + ) + else: + self.attn_backend = get_attn_backend( + 0, + self.dtype, + None, + self.block_size, + use_mla=self.model_config.use_mla, + ) if torch.version.cann.startswith("8.3"): self.attn_mask_builder = AttentionMaskBuilder( self.scheduler_config.max_num_batched_tokens, self.dtype,