[CI] Upgrade vLLM version (#3139)

Upgrade vLLM version to the newest commit.
- Fix the breaking change introduced by
969b4da3a6
- Add a patch as a quick fix for torchair after
de94289a98
- Fix the unit test failure introduced by
de94289a98

Close: https://github.com/vllm-project/vllm-ascend/issues/3138


- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9
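
Most of the compatibility fixes in the diff below branch on the installed
vLLM version via vllm_ascend.utils.vllm_version_is. A minimal sketch of that
gating pattern (only the helper name comes from this repo; the branch bodies
are placeholders, not the real call sites):

    # Minimal sketch of the version gate used throughout this PR.
    from vllm_ascend.utils import vllm_version_is

    if vllm_version_is("0.10.2"):
        ...  # keep the signature the pinned v0.10.2 release expects
    else:
        ...  # follow the new API on vLLM main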

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Author: wangxiyuan
Date: 2025-09-25 07:36:51 +08:00
Committed by: GitHub
Parent: 464270e4ca
Commit: a055183821

9 changed files with 105 additions and 15 deletions


@@ -36,7 +36,7 @@ jobs:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=f225ea7dd98e9f29752e5c032cd4a8ee1d712f16
+          VLLM_COMMIT=b1068903fdca26cf6b4a1a51a32c3365ce3ac636
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
       - name: Checkout repository


@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: f225ea7dd98e9f29752e5c032cd4a8ee1d712f16
+      vllm: b1068903fdca26cf6b4a1a51a32c3365ce3ac636

   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2]
+        vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
     steps:
       - name: Install packages
         run: |
@@ -138,7 +138,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2]
+        vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.


@@ -68,7 +68,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2]
+        vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
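
All four workflow hunks above bump the same hard-coded commit pin. A
throwaway check like the following (an illustrative helper, not part of this
repo) can confirm that no stale hash survives the bump:

    # Illustrative one-off check: make sure no workflow still carries the
    # superseded vLLM pin. Paths and pins come from the hunks above.
    import pathlib

    OLD_PIN = "f225ea7dd98e9f29752e5c032cd4a8ee1d712f16"

    for wf in pathlib.Path(".github/workflows").glob("*.y*ml"):
        assert OLD_PIN not in wf.read_text(), f"stale vLLM pin in {wf}"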


@@ -298,6 +298,10 @@ class TestAscendQwen2_5_VisionTransformer(PytestBase):
             "vllm_ascend.ops.linear_op.get_tp_group",
             return_value=mock_group,
         )
+        mocker.patch(
+            "vllm.distributed.parallel_state.get_tp_group",
+            return_value=mock_group,
+        )
         vision_transformer = AscendQwen2_5_VisionTransformer(
             vision_config,


@@ -33,6 +33,10 @@ class BaseLinearTest(unittest.TestCase):
                   return_value=self.mock_group),
             patch("vllm_ascend.ops.linear_op.get_tp_group",
                   return_value=self.mock_group),
+            patch(
+                "vllm.distributed.parallel_state.get_tp_group",
+                return_value=self.mock_group,
+            ),
             patch("vllm_ascend.utils.mlp_tp_enable", return_value=True),
             patch("vllm_ascend.utils.oproj_tp_enable", return_value=True)
         ]
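
Both test hunks apply the same fix: after the upgrade, get_tp_group is
resolved through vllm.distributed.parallel_state as well as through
vllm_ascend.ops.linear_op, so both lookup paths must return the mocked
group. A standalone sketch of the pattern (toy code, not the project's
actual tests):

    # Toy sketch of the double-patch pattern above: the code under test
    # may import get_tp_group from either module, so both targets are
    # patched with the same mock.
    from unittest.mock import MagicMock, patch

    mock_group = MagicMock()

    with patch("vllm_ascend.ops.linear_op.get_tp_group",
               return_value=mock_group), \
         patch("vllm.distributed.parallel_state.get_tp_group",
               return_value=mock_group):
        pass  # code under test sees the mock through either path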


@@ -22,6 +22,7 @@ if HAS_TRITON:
 import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_logits  # noqa
+import vllm_ascend.patch.worker.patch_common.patch_weight_loader  # noqa
 # TODO: revert me when triton import is fixed
 # import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa


@@ -0,0 +1,60 @@
+import torch
+from torch.nn.parameter import Parameter
+from vllm.logger import init_logger
+# yapf: disable
+from vllm.model_executor.parameter import ModelWeightParameter
+# yapf: enable
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.utils import GiB_bytes
+
+from vllm_ascend.utils import vllm_version_is
+
+logger = init_logger(__name__)
+
+
+def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
+                   output_partition_sizes: list[int], input_size: int,
+                   output_size: int, params_dtype: torch.dtype,
+                   **extra_weight_attrs):
+    from vllm_ascend.ascend_config import get_ascend_config
+    ascend_config = get_ascend_config()
+    # This method creates unquantized linear weights.
+    # The weights are not quantized, and they are not sharded.
+    # The amount of memory allocated for the weights is
+    # sum(output_partition_sizes) * input_size_per_partition.
+    try:
+        if ascend_config.torchair_graph_config.enabled:
+            weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                           input_size_per_partition,
+                                           dtype=params_dtype),
+                               requires_grad=False)
+        else:
+            weight_loader = extra_weight_attrs.pop("weight_loader")
+            weight = ModelWeightParameter(data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype),
+                                          input_dim=1,
+                                          output_dim=0,
+                                          weight_loader=weight_loader)
+    except torch.cuda.OutOfMemoryError as e:
+        logger.error("Failed to create unquantized linear weights: %s", e)
+        if torch.cuda.is_available():
+            logger.debug("CUDA device: %s", torch.cuda.current_device())
+            logger.debug("Allocated: %.2f GiB",
+                         torch.cuda.memory_allocated() / GiB_bytes)
+            logger.debug("Reserved: %.2f GiB",
+                         torch.cuda.memory_reserved() / GiB_bytes)
+        raise RuntimeError(
+            "Failed to create unquantized linear weights. "
+            "This may be caused by insufficient memory to allocate "
+            "the weight.") from e
+
+    if ascend_config.torchair_graph_config.enabled:
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+
+    layer.register_parameter("weight", weight)
+    set_weight_attrs(weight, extra_weight_attrs)
+
+
+if not vllm_version_is("0.10.2"):
+    from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+    UnquantizedLinearMethod.create_weights = create_weights
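
The new file above is a class-level monkey patch: once imported, assigning
to UnquantizedLinearMethod.create_weights rebinds the method for every
instance created afterwards. A toy illustration of the mechanism (made-up
class; only the rebinding is the point):

    # Toy illustration of a class-level monkey patch; the class and
    # bodies here are hypothetical.
    class LinearMethod:
        def create_weights(self):
            return "original"

    def patched_create_weights(self):
        return "patched"

    LinearMethod.create_weights = patched_create_weights
    assert LinearMethod().create_weights() == "patched"  # all instances see it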


@@ -28,6 +28,20 @@ from vllm_ascend.worker.worker_v1 import NPUWorker
 class NPUTorchairWorker(NPUWorker):
     """Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
 
+    def __init__(self,
+                 vllm_config,
+                 local_rank,
+                 rank,
+                 distributed_init_method,
+                 is_driver_worker=False,
+                 **kwargs):
+        super().__init__(vllm_config, local_rank, rank,
+                         distributed_init_method, is_driver_worker, **kwargs)
+
+        from vllm.model_executor.layers.linear import \
+            WEIGHT_LOADER_V2_SUPPORTED
+        if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
+            WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
+
     def determine_available_memory(self) -> int:
         """Override determine_available_memory to use cached torchair kv_cache_bytes."""


@@ -304,17 +304,24 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             (self.max_num_tokens, self.model_config.get_hidden_size()),
             dtype=self.dtype,
             device=self.device)
         # Set up Attention
-        self.attn_backend = get_attn_backend(
-            0,
-            self.dtype,
-            None,
-            self.block_size,
-            self.model_config.is_attention_free,
-            use_mla=self.model_config.use_mla,
-        )
+        if vllm_version_is("0.10.2"):
+            self.attn_backend = get_attn_backend(
+                0,
+                self.dtype,
+                None,
+                self.block_size,
+                self.model_config.is_attention_free,
+                use_mla=self.model_config.use_mla,
+            )
+        else:
+            self.attn_backend = get_attn_backend(
+                0,
+                self.dtype,
+                None,
+                self.block_size,
+                use_mla=self.model_config.use_mla,
+            )
         if torch.version.cann.startswith("8.3"):
             self.attn_mask_builder = AttentionMaskBuilder(
                 self.scheduler_config.max_num_batched_tokens, self.dtype,
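
The duplicated call above exists because vLLM main dropped the
is_attention_free parameter from get_attn_backend. An equivalent shim that
sniffs the installed signature instead of the version string could look like
this (a sketch, assuming the parameter removal is the only difference
between the two versions):

    # Hypothetical alternative to the version branch: pass
    # is_attention_free only when the installed vLLM still accepts it.
    import inspect
    from vllm.attention import get_attn_backend

    def get_attn_backend_compat(head_size, dtype, kv_cache_dtype,
                                block_size, is_attention_free, use_mla):
        if "is_attention_free" in inspect.signature(
                get_attn_backend).parameters:
            return get_attn_backend(head_size, dtype, kv_cache_dtype,
                                    block_size, is_attention_free,
                                    use_mla=use_mla)
        return get_attn_backend(head_size, dtype, kv_cache_dtype,
                                block_size, use_mla=use_mla)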