[CI] Upgrade vLLM version (#3139)

Upgrade vLLM version to the newest commit.
- Fix the breaking change introduced by
969b4da3a6
- Add a patch to quickly fix torchair
de94289a98
- Fix the unit-test (UT) error introduced by
de94289a98

Close: https://github.com/vllm-project/vllm-ascend/issues/3138


- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
wangxiyuan
2025-09-25 07:36:51 +08:00
committed by GitHub
parent 464270e4ca
commit a055183821
9 changed files with 105 additions and 15 deletions

View File

@@ -36,7 +36,7 @@ jobs:
- name: Get vLLM version - name: Get vLLM version
run: | run: |
VLLM_COMMIT=f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 VLLM_COMMIT=b1068903fdca26cf6b4a1a51a32c3365ce3ac636
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
- name: Checkout repository - name: Checkout repository

View File

@@ -42,7 +42,7 @@ jobs:
lint: lint:
uses: ./.github/workflows/pre-commit.yml uses: ./.github/workflows/pre-commit.yml
with: with:
vllm: f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 vllm: b1068903fdca26cf6b4a1a51a32c3365ce3ac636
changes: changes:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
strategy: strategy:
matrix: matrix:
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
steps: steps:
- name: Install packages - name: Install packages
run: | run: |
@@ -138,7 +138,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -68,7 +68,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -298,6 +298,10 @@ class TestAscendQwen2_5_VisionTransformer(PytestBase):
"vllm_ascend.ops.linear_op.get_tp_group", "vllm_ascend.ops.linear_op.get_tp_group",
return_value=mock_group, return_value=mock_group,
) )
mocker.patch(
"vllm.distributed.parallel_state.get_tp_group",
return_value=mock_group,
)
vision_transformer = AscendQwen2_5_VisionTransformer( vision_transformer = AscendQwen2_5_VisionTransformer(
vision_config, vision_config,

View File

@@ -33,6 +33,10 @@ class BaseLinearTest(unittest.TestCase):
return_value=self.mock_group), return_value=self.mock_group),
patch("vllm_ascend.ops.linear_op.get_tp_group", patch("vllm_ascend.ops.linear_op.get_tp_group",
return_value=self.mock_group), return_value=self.mock_group),
patch(
"vllm.distributed.parallel_state.get_tp_group",
return_value=self.mock_group,
),
patch("vllm_ascend.utils.mlp_tp_enable", return_value=True), patch("vllm_ascend.utils.mlp_tp_enable", return_value=True),
patch("vllm_ascend.utils.oproj_tp_enable", return_value=True) patch("vllm_ascend.utils.oproj_tp_enable", return_value=True)
] ]

View File

@@ -22,6 +22,7 @@ if HAS_TRITON:
import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa
import vllm_ascend.patch.worker.patch_common.patch_logits # noqa import vllm_ascend.patch.worker.patch_common.patch_logits # noqa
import vllm_ascend.patch.worker.patch_common.patch_weight_loader # noqa
# TODO: revert me when triton import is fixed # TODO: revert me when triton import is fixed
# import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa # import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa

View File

@@ -0,0 +1,60 @@
import torch
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
# yapf: disable
from vllm.model_executor.parameter import ModelWeightParameter
# yapf: enable
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils import GiB_bytes
from vllm_ascend.utils import vllm_version_is
logger = init_logger(__name__)
def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
output_partition_sizes: list[int], input_size: int,
output_size: int, params_dtype: torch.dtype,
**extra_weight_attrs):
from vllm_ascend.ascend_config import get_ascend_config
ascend_config = get_ascend_config()
# This method creates unquantized linear weights.
# The weights are not quantized, and they are not sharded.
# The amount of memory allocated for the weights is
# sum(output_partition_sizes) * input_size_per_partition.
try:
if ascend_config.torchair_graph_config.enabled:
weight = Parameter(torch.empty(sum(output_partition_sizes),
input_size_per_partition,
dtype=params_dtype),
requires_grad=False)
else:
weight_loader = extra_weight_attrs.pop("weight_loader")
weight = ModelWeightParameter(data=torch.empty(
sum(output_partition_sizes),
input_size_per_partition,
dtype=params_dtype),
input_dim=1,
output_dim=0,
weight_loader=weight_loader)
except torch.cuda.OutOfMemoryError as e:
logger.error("Failed to create unquantized linear weights: %s", e)
if torch.cuda.is_available():
logger.debug("CUDA device: %s", torch.cuda.current_device())
logger.debug("Allocated: %.2f GiB",
torch.cuda.memory_allocated() / GiB_bytes)
logger.debug("Reserved: %.2f GiB",
torch.cuda.memory_reserved() / GiB_bytes)
raise RuntimeError(
"Failed to create unquantized linear weights. "
"This may be caused by insufficient memory to allocate "
"the weight.") from e
if ascend_config.torchair_graph_config.enabled:
set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
layer.register_parameter("weight", weight)
set_weight_attrs(weight, extra_weight_attrs)
if not vllm_version_is("0.10.2"):
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
UnquantizedLinearMethod.create_weights = create_weights

View File

@@ -28,6 +28,20 @@ from vllm_ascend.worker.worker_v1 import NPUWorker
class NPUTorchairWorker(NPUWorker): class NPUTorchairWorker(NPUWorker):
"""Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class.""" """Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
def __init__(self,
vllm_config,
local_rank,
rank,
distributed_init_method,
is_driver_worker=False,
**kwargs):
super().__init__(vllm_config, local_rank, rank,
distributed_init_method, is_driver_worker, **kwargs)
from vllm.model_executor.layers.linear import \
WEIGHT_LOADER_V2_SUPPORTED
if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
def determine_available_memory(self) -> int: def determine_available_memory(self) -> int:
"""Override determine_available_memory to use cached torchair kv_cache_bytes.""" """Override determine_available_memory to use cached torchair kv_cache_bytes."""

View File

@@ -304,17 +304,24 @@ class NPUModelRunner(LoRAModelRunnerMixin):
(self.max_num_tokens, self.model_config.get_hidden_size()), (self.max_num_tokens, self.model_config.get_hidden_size()),
dtype=self.dtype, dtype=self.dtype,
device=self.device) device=self.device)
# Set up Attention # Set up Attention
self.attn_backend = get_attn_backend( if vllm_version_is("0.10.2"):
0, self.attn_backend = get_attn_backend(
self.dtype, 0,
None, self.dtype,
self.block_size, None,
self.model_config.is_attention_free, self.block_size,
use_mla=self.model_config.use_mla, self.model_config.is_attention_free,
) use_mla=self.model_config.use_mla,
)
else:
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
use_mla=self.model_config.use_mla,
)
if torch.version.cann.startswith("8.3"): if torch.version.cann.startswith("8.3"):
self.attn_mask_builder = AttentionMaskBuilder( self.attn_mask_builder = AttentionMaskBuilder(
self.scheduler_config.max_num_batched_tokens, self.dtype, self.scheduler_config.max_num_batched_tokens, self.dtype,