From 2930e4a6bdcc6405c9924c35d70c5bc00f844b04 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Fri, 26 Sep 2025 06:18:15 +0800
Subject: [PATCH] [CI] Upgrade vllm to newest commit (#3182)

### What this PR does / why we need it?
Upgrade vLLM to newest commit

- Fix aclgraph not working, caused by
https://github.com/vllm-project/vllm/commit/24fab45d96a91b491db338ee02cd24e55b7fbb5f
- Fix PoolerOutput import error, caused by
https://github.com/vllm-project/vllm/commit/755ed7b05be4743237d3339c4ff8c22bcaae04f4
- Fix the aclgraph weight-loading error in the same way as the existing torchair fix:
https://github.com/vllm-project/vllm/commit/4492e3a55428e161ca8db381edc28263e5da4c8d

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
All tests should pass.


- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/52d0cb845866869d587fc013a7c59e60a86ebcf2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 .github/workflows/format_pr_body.yaml         |  2 +-
 .github/workflows/vllm_ascend_test.yaml       |  6 ++--
 .github/workflows/vllm_ascend_test_full.yaml  |  2 +-
 .../patch_common/patch_weight_loader.py       | 26 +++-----------
 vllm_ascend/platform.py                       |  5 +++
 vllm_ascend/quantization/quant_config.py      |  5 +--
 vllm_ascend/torchair/torchair_worker.py       | 14 --------
 vllm_ascend/worker/model_runner_v1.py         | 36 ++++++++++++++-----
 vllm_ascend/worker/worker_v1.py               |  6 ++++
 9 files changed, 49 insertions(+), 53 deletions(-)

diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml
index 7fc23ee..a99d812 100644
--- a/.github/workflows/format_pr_body.yaml
+++ b/.github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:
 
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=52d0cb845866869d587fc013a7c59e60a86ebcf2
+          VLLM_COMMIT=17b4c6685ce62d5652654784d6771a3d38e4273e
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index d1e1af5..a45d676 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 52d0cb845866869d587fc013a7c59e60a86ebcf2
+      vllm: 17b4c6685ce62d5652654784d6771a3d38e4273e
 
   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
         VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     steps:
       - name: Install packages
         run: |
@@ -138,7 +138,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml
index 1d628dd..6306032 100644
--- a/.github/workflows/vllm_ascend_test_full.yaml
+++ b/.github/workflows/vllm_ascend_test_full.yaml
@@ -68,7 +68,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
index 4bbd6d3..10705d3 100644
--- a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
@@ -1,9 +1,6 @@
 import torch
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
-# yapf: disable
-from vllm.model_executor.parameter import ModelWeightParameter
-# yapf: enable
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import GiB_bytes
 
@@ -16,27 +13,15 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                    output_partition_sizes: list[int], input_size: int,
                    output_size: int, params_dtype: torch.dtype,
                    **extra_weight_attrs):
-    from vllm_ascend.ascend_config import get_ascend_config
-    ascend_config = get_ascend_config()
     # This method creates unquantized linear weights.
     # The weights are not quantized, and they are not sharded.
     # The amount of memory allocated for the weights is
     # sum(output_partition_sizes) * input_size_per_partition.
     try:
-        if ascend_config.torchair_graph_config.enabled:
-            weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                           input_size_per_partition,
-                                           dtype=params_dtype),
-                               requires_grad=False)
-        else:
-            weight_loader = extra_weight_attrs.pop("weight_loader")
-            weight = ModelWeightParameter(data=torch.empty(
-                sum(output_partition_sizes),
-                input_size_per_partition,
-                dtype=params_dtype),
-                                          input_dim=1,
-                                          output_dim=0,
-                                          weight_loader=weight_loader)
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=params_dtype),
+                           requires_grad=False)
     except torch.cuda.OutOfMemoryError as e:
         logger.error("Failed to create unquantized linear weights: %s", e)
         if torch.cuda.is_available():
@@ -49,8 +34,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
             "Failed to create unquantized linear weights. "
             "This may be caused by insufficient memory to allocate "
             "the weight.") from e
-    if ascend_config.torchair_graph_config.enabled:
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+    set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
     layer.register_parameter("weight", weight)
     set_weight_attrs(weight, extra_weight_attrs)
 
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index f00abca..f25f984 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -209,6 +209,11 @@ class NPUPlatform(Platform):
         # set cudaprah sizes before extending `compilation_config.splitting_ops`
         vllm_config._set_cudagraph_sizes()
 
+        # TODO: Full graph is not fully supported yet, so fall back to piecewise; make full graph the default once it is.
+        if not vllm_version_is("0.10.2"):
+            if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
         if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             compilation_config.level = CompilationLevel.NO_COMPILATION
         # TODO: Currently MLA does not support FULL_DECODE_ONLY, remove the second condition
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 1a5e74d..130251c 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -33,7 +33,6 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     UnquantizedEmbeddingMethod, VocabParallelEmbedding)
-from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
@@ -251,7 +250,6 @@ class AscendLinearMethod(LinearMethodBase):
         **extra_weight_attrs,
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")
 
         weight_dict = self.quant_method.get_weight(input_size_per_partition,
                                                    output_size_per_partition,
@@ -264,8 +262,7 @@ class AscendLinearMethod(LinearMethodBase):
 
         pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
         for pertensor_name, pertensor_param in pertensor_dict.items():
-            param = PerTensorScaleParameter(data=pertensor_param,
-                                            weight_loader=weight_loader)
+            param = torch.nn.Parameter(pertensor_param, requires_grad=False)
             # disable warning
             param.ignore_warning = True
             layer.register_parameter(pertensor_name, param)
diff --git a/vllm_ascend/torchair/torchair_worker.py b/vllm_ascend/torchair/torchair_worker.py
index ec3f1aa..dbee800 100644
--- a/vllm_ascend/torchair/torchair_worker.py
+++ b/vllm_ascend/torchair/torchair_worker.py
@@ -28,20 +28,6 @@ from vllm_ascend.worker.worker_v1 import NPUWorker
 class NPUTorchairWorker(NPUWorker):
     """Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
 
-    def __init__(self,
-                 vllm_config,
-                 local_rank,
-                 rank,
-                 distributed_init_method,
-                 is_driver_worker=False,
-                 **kwargs):
-        super().__init__(vllm_config, local_rank, rank,
-                         distributed_init_method, is_driver_worker, **kwargs)
-        from vllm.model_executor.layers.linear import \
-            WEIGHT_LOADER_V2_SUPPORTED
-        if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
-            WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
-
     def determine_available_memory(self) -> int:
         """Override determine_available_memory to use cached torchair kv_cache_bytes."""
 
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 124fdcd..4d9e338 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -64,11 +64,12 @@ from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
-from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LazyLoader, cdiv, get_dtype_size,
                         is_pin_memory_available)
+from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport, reorder_batch_to_split_decodes_and_prefills)
@@ -144,7 +145,9 @@ else:
 
 if not vllm_version_is("0.10.2"):
     from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
+    from vllm.v1.outputs import PoolerOutput
 else:
+    from vllm.sequence import PoolerOutput
     UniformTypeKVCacheSpecs = None
 
 
@@ -1806,18 +1809,30 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                               device=hidden_states.device)
         seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
 
-        # Pooling models D2H & synchronize occurs in pooler.py:build_output
-        raw_pooler_output = self.model.pooler(
-            hidden_states=hidden_states, pooling_metadata=pooling_metadata)
+        if vllm_version_is("0.10.2"):
+            # Pooling models D2H & synchronize occurs in pooler.py:build_output
+            raw_pooler_output = self.model.pooler(
+                hidden_states=hidden_states, pooling_metadata=pooling_metadata)
+        else:
+            model = cast(VllmModelForPooling, self.model)
+            raw_pooler_output = model.pooler(
+                hidden_states=hidden_states,
+                pooling_metadata=pooling_metadata,
+            )
+            raw_pooler_output = json_map_leaves(
+                lambda x: x.to("cpu", non_blocking=True),
+                raw_pooler_output,
+            )
+            torch.npu.synchronize()
 
         pooler_output: list[Optional[torch.Tensor]] = []
         for raw_output, seq_len, prompt_len in zip(
                 raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-
-            if seq_len == prompt_len:
-                pooler_output.append(raw_output.data)
+            if vllm_version_is("0.10.2"):
+                output = raw_output.data if seq_len == prompt_len else None
             else:
-                pooler_output.append(None)
+                output = raw_output if seq_len == prompt_len else None
+            pooler_output.append(output)
 
         return ModelRunnerOutput(
             req_ids=self.input_batch.req_ids,
@@ -2582,7 +2597,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for task in self.get_supported_pooling_tasks():
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            output_size[task] = output.get_data_nbytes()
+            if vllm_version_is("0.10.2"):
+                output_size[task] = output.get_data_nbytes()
+            else:
+                output_size[task] = sum(o.nbytes for o in output)
             del output  # Allow GC
 
         max_task = max(output_size.items(), key=lambda x: x[1])[0]
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index fedec87..c1fc800 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -116,6 +116,12 @@ class NPUWorker(WorkerBase):
             # Buffers saved before sleep
             self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
 
+        # FIXME: this is a workaround for the issue caused by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170
+        from vllm.model_executor.layers.linear import \
+            WEIGHT_LOADER_V2_SUPPORTED
+        if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
+            WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
+
     def sleep(self, level: int = 1) -> None:
         if not sleep_mode_enabled():
             raise ValueError(