Upgrade vLLM to v0.10.0 (#1927)

### What this PR does / why we need it? - Upgrade to v0.10.0 - Drop v0.9.2 version compatibility - Add patch for `vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py` as workaround of f3a683b7c9 for v0.10.0 and also add e2e test `test_models_prompt_logprobs` - Pin transformers<4.54.0 as workaround of https://github.com/vllm-project/vllm-ascend/issues/2034 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Test locally: `VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs` - CI passed - vLLM version: v0.9.2 - vLLM main: 7728dd77bb --------- Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-26 15:43:29 +08:00
parent 2f50304c19
commit 17a430f7b8
29 changed files with 198 additions and 251 deletions
--- a/.github/workflows/accuracy_test.yaml
+++ b/.github/workflows/accuracy_test.yaml
@@ -37,7 +37,7 @@ on:
        # Current supported vLLM versions
        options:
          - main
-          - v0.9.2
+          - v0.10.0
          - v0.9.1
          - v0.7.3
      vllm-ascend-version:
@@ -163,7 +163,7 @@ jobs:
          repository: vllm-project/vllm
          path: ./vllm-empty
          # Please also update this when bump matched version
-          ref: ${{ github.event.inputs.vllm-version || 'v0.9.2' }}
+          ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }}

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
--- a/.github/workflows/nightly_benchmarks.yaml
+++ b/.github/workflows/nightly_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - vllm_branch: v0.9.2
+          - vllm_branch: v0.10.0
            vllm_ascend_branch: main
            vllm_use_v1: 1
      max-parallel: 1
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -81,7 +81,7 @@ jobs:
        VLLM_USE_MODELSCOPE: True
    strategy:
      matrix:
-        vllm_version: [main, v0.9.2]
+        vllm_version: [main, v0.10.0]
    steps:
      - name: Install packages
        run: |
@@ -137,7 +137,7 @@ jobs:
      max-parallel: 2
      matrix:
        os: [linux-arm64-npu-1]
-        vllm_version: [main, v0.9.2]
+        vllm_version: [main, v0.10.0]
    name: singlecard e2e test
    runs-on: ${{ matrix.os }}
    container:
@@ -216,7 +216,7 @@ jobs:
      max-parallel: 1
      matrix:
        os: [linux-arm64-npu-4]
-        vllm_version: [main, v0.9.2]
+        vllm_version: [main, v0.10.0]
    name: multicard e2e test
    runs-on: ${{ matrix.os }}
    container:
--- a/.github/workflows/vllm_ascend_test_long_term.yaml
+++ b/.github/workflows/vllm_ascend_test_long_term.yaml
@@ -43,7 +43,7 @@ jobs:
      max-parallel: 2
      matrix:
        os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.9.2]
+        vllm_version: [main, v0.10.0]
    name: vLLM Ascend long term test
    runs-on: ${{ matrix.os }}
    container:
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,7 +77,7 @@ myst_substitutions = {
    # CANN image tag
    'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10",
    # vllm version in ci
-    'ci_vllm_version': 'v0.9.2',
+    'ci_vllm_version': 'v0.10.0',
 }

 # Add any paths that contain templates here, relative to this directory.
--- a/docs/source/developer_guide/feature_guide/patch.md
+++ b/docs/source/developer_guide/feature_guide/patch.md
@@ -38,15 +38,15 @@ vllm_ascend

 In both **platform** and **worker** folder, there are several patch modules. They are used for patching different version of vLLM.

- `patch_0_9_2`: This module is used for patching vLLM 0.9.2. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_9_2` is used for patching vLLM 0.9.2.
+- `patch_0_10_0`: This module is used for patching vLLM 0.10.0. The version always tracks the latest vLLM release. Once a new vLLM version is released, we will drop this patch module and add one for the new version. For example, `patch_0_10_0` is used for patching vLLM 0.10.0.
 - `patch_main`: This module is used for patching the code in vLLM main branch.
- `patch_common`: This module is used for patching both vLLM 0.9.2 and vLLM main branch.
+- `patch_common`: This module is used for patching both vLLM 0.10.0 and vLLM main branch.

 ## How to write a patch

 Before writing a patch, following the principle above, we should patch the least code. If it's necessary, we can patch the code in either **platform** and **worker** folder. Here is an example to patch `distributed` module in vLLM.

-1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.9.2 and main of vLLM.
+1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.10.0 and main of vLLM.
 2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`.
 3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`.
 4. Write your patch code in the new file. Here is an example:
@@ -82,4 +82,4 @@ Before writing a patch, following the principle above, we should patch the least

 ## Limitation
 1. In V1 Engine, vLLM starts three kinds of process: Main process, EngineCore process and Worker process. Now vLLM Ascend only support patch the code in Main process and Worker process by default. If you want to patch the code runs in EngineCore process, you should patch EngineCore process entirely during setup, the entry code is here `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely.
-2. If you are running an edited vLLM code, the version of the vLLM may be changed automatically. For example, if you runs an edited vLLM based on v0.9.n, the version of vLLM may be change to v0.9.nxxx, in this case, the patch for v0.9.n in vLLM Ascend would not work as expect, because that vLLM Ascend can't distinguish the version of vLLM you're using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, then the patch for v0.9.2 should work.
+2. If you are running an edited vLLM code base, the version of vLLM may be changed automatically. For example, if you run an edited vLLM based on v0.10.0, the version of vLLM may be changed to v0.10.0xxx. In this case, the patch for v0.10.0 in vLLM Ascend would not work as expected, because vLLM Ascend can't identify the version of vLLM you're using. To work around this, set the environment variable `VLLM_VERSION` to the version of vLLM you're using; the patch for v0.10.0 should then work.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,8 @@ requires = [
    "msgpack",
    "quart",
    "numba",
+    # Remove after https://github.com/vllm-project/vllm-ascend/issues/2034
+    "transformers<4.54.0",
 ]
 build-backend = "setuptools.build_meta"

--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,8 @@ setuptools-scm>=8
 torch>=2.5.1
 torchvision<0.21.0
 wheel
+# Remove after https://github.com/vllm-project/vllm-ascend/issues/2034
+transformers<4.54.0

 # requirements for disaggregated prefill
 msgpack
--- a/tests/e2e/singlecard/test_offline_inference.py
+++ b/tests/e2e/singlecard/test_offline_inference.py
@@ -127,3 +127,19 @@ def test_models_topk() -> None:
                    enforce_eager=True,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
+
+
+def test_models_prompt_logprobs() -> None:
+    """Smoke test: greedy generation with logprobs must not raise."""
+    example_prompts = [
+        "Hello, my name is",
+    ]
+
+    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
+                    max_model_len=8192,
+                    dtype="float16",
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy_logprobs(example_prompts,
+                                            max_tokens=5,
+                                            num_logprobs=1)
--- a/tests/ut/attention/test_attention_v1.py
+++ b/tests/ut/attention/test_attention_v1.py
@@ -3,15 +3,12 @@ from unittest.mock import MagicMock, patch
 import torch

 from tests.ut.base import TestBase
-from vllm_ascend.attention.attention_v1 import \
-    AscendAttentionBackendImpl092  # isort: skip
 from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                AscendAttentionBackendImpl,
                                                AscendAttentionMetadataBuilder,
                                                AscendAttentionState,
                                                AscendMetadata,
                                                CommonAttentionState)
-from vllm_ascend.utils import vllm_version_is


 class TestAscendAttentionBackend(TestBase):
@@ -20,12 +17,8 @@ class TestAscendAttentionBackend(TestBase):
        self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")

    def test_get_impl_cls(self):
-        if vllm_version_is("0.9.2"):
-            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
-                             AscendAttentionBackendImpl092)
-        else:
-            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
-                             AscendAttentionBackendImpl)
+        self.assertEqual(AscendAttentionBackend.get_impl_cls(),
+                         AscendAttentionBackendImpl)

    def test_get_metadata_cls(self):
        self.assertEqual(AscendAttentionBackend.get_metadata_cls(),
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -17,7 +17,7 @@

 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import torch
 import torch_npu
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
+                               nd_to_nz_2d, nd_to_nz_spec)


 class AscendAttentionBackend(AttentionBackend):
@@ -43,8 +43,6 @@ class AscendAttentionBackend(AttentionBackend):

    @staticmethod
    def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionBackendImpl092
        return AscendAttentionBackendImpl

    @staticmethod
@@ -440,38 +438,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
        return output.view(num_tokens, self.hidden_size)


-class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )
-
-
 def unified_ascend_attention_with_output(
    query: torch.Tensor,
    key: torch.Tensor,
--- a/vllm_ascend/attention/attention_v1_torchair.py
+++ b/vllm_ascend/attention/attention_v1_torchair.py
@@ -16,7 +16,7 @@
 #

 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import numpy as np
 import torch
@@ -29,7 +29,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, vllm_version_is)
+                               nd_to_nz_2d)


 class AscendAttentionTorchairBackend(AttentionBackend):
@@ -41,8 +41,6 @@ class AscendAttentionTorchairBackend(AttentionBackend):

    @staticmethod
    def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionTorchairBackendImpl092
        return AscendAttentionTorchairBackendImpl

    @staticmethod
@@ -489,36 +487,3 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
                "to use ascend scheduler.")

        return output.view(num_tokens, self.hidden_size)
-
-
-class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
-                                            ):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1,12 +1,11 @@
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
-                    TypeVar)
+from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar

 import numpy as np
 import torch
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata, AttentionType,
+                                              AttentionMetadata,
                                              MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import get_current_vllm_config
@@ -22,7 +21,7 @@ from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
 from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
-from vllm_ascend.utils import npu_prefetch, vllm_version_is
+from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch

 if TYPE_CHECKING:
@@ -54,8 +53,6 @@ class AscendMLABackend(AttentionBackend):

    @staticmethod
    def get_impl_cls() -> Type["MLAAttentionImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendMLAImpl092
        return AscendMLAImpl


@@ -1212,34 +1209,3 @@ class AscendMLAImpl(MLAAttentionImpl):
                output[:num_decode_tokens] = output_decode

        return output_padded
-
-
-class AscendMLAImpl092(AscendMLAImpl):
-
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 num_kv_heads: int,
-                 alibi_slopes: Optional[List[float]],
-                 sliding_window: Optional[int],
-                 kv_cache_dtype: str,
-                 blocksparse_params: Optional[Dict[str, Any]] = None,
-                 logits_soft_cap: Optional[float] = None,
-                 attn_type: str = AttentionType.DECODER,
-                 kv_sharing_target_layer_name: Optional[str] = None,
-                 use_irope: bool = False,
-                 **kwargs) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-            **kwargs)
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -32,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager

-from vllm_ascend.utils import vllm_version_is
-

 class AscendScheduler(Scheduler):
    """This Scheduler extends vllm's original v1 scheduler
@@ -283,23 +281,12 @@ class AscendScheduler(Scheduler):
                    # allow the lower-priority requests to be scheduled.
                    req_index += 1
                    continue
-                if vllm_version_is("0.9.2"):
-                    num_draft_tokens = max(
-                        num_new_tokens + request.num_computed_tokens -
-                        request.num_tokens, 0)

                while True:
-                    if vllm_version_is("0.9.2"):
-                        new_blocks = self.kv_cache_manager.allocate_slots(
-                            request,
-                            num_new_tokens,
-                            num_draft_tokens=num_draft_tokens,
-                            num_lookahead_tokens=self.num_lookahead_tokens)
-                    else:
-                        new_blocks = self.kv_cache_manager.allocate_slots(
-                            request,
-                            num_new_tokens,
-                            num_lookahead_tokens=self.num_lookahead_tokens)
+                    new_blocks = self.kv_cache_manager.allocate_slots(
+                        request,
+                        num_new_tokens,
+                        num_lookahead_tokens=self.num_lookahead_tokens)
                    if new_blocks is None:
                        # The request cannot be scheduled.
                        # Preempt the lowest-priority request.
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -24,9 +24,9 @@
 #           each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-# - patch_0_9_2: contains the patches applied when vllm version is 0.9.2.
+# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0.
 # - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.9.2 and main branch.
+# - patch_common: contains the patches applied in both 0.10.0 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------
@@ -101,3 +101,16 @@
 #       - https://github.com/vllm-project/vllm-ascend/pull/1732
 #    Future Plan:
 #       Revert it when the ascend scatter performance improves.
+#
+# ** File: worker/patch_0_10_0/patch_sampler_gather_logprobs.py **
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. `vllm.v1.sample.sampler.Sampler.gather_logprobs`
+#    Why:
+#       gather_logprobs must call a batched_count_greater_than compiled
+#       with backend=current_platform.simple_compile_backend
+#    How:
+#       Patch gather_logprobs to call the new batched_count_greater_than
+#    Related PR (if no, explain why):
+#       - https://github.com/vllm-project/vllm/pull/21591
+#    Future Plan:
+#       Revert it when vLLM merges #21591 and releases a new version
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is

 # Import specific patches for different versions
-if vllm_version_is("0.9.2"):
-    from vllm_ascend.patch.platform import patch_0_9_2  # noqa: F401
+if vllm_version_is("0.10.0"):
+    from vllm_ascend.patch.platform import patch_0_10_0  # noqa: F401
    from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
    from vllm_ascend.patch.platform import patch_common  # noqa: F401
--- a/vllm_ascend/patch/platform/patch_0_10_0/__init__.py
+++ b/vllm_ascend/patch/platform/patch_0_10_0/__init__.py
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is

 # Import specific patches for different versions
-if vllm_version_is("0.9.2"):
-    from vllm_ascend.patch.worker import patch_0_9_2  # noqa: F401
+if vllm_version_is("0.10.0"):
+    from vllm_ascend.patch.worker import patch_0_10_0  # noqa: F401
    from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
    from vllm_ascend.patch.worker import patch_common  # noqa: F401
--- a/vllm_ascend/patch/worker/patch_0_10_0/__init__.py
+++ b/vllm_ascend/patch/worker/patch_0_10_0/__init__.py
@@ -14,3 +14,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+import vllm_ascend.patch.worker.patch_0_10_0.patch_sampler_gather_logprobs  # noqa
--- a/vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py
+++ b/vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py
@@ -0,0 +1,87 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import torch
+from vllm.platforms import current_platform
+from vllm.v1.outputs import LogprobsTensors
+from vllm.v1.sample.sampler import Sampler
+
+
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
+def batched_count_greater_than(x: torch.Tensor,
+                               values: torch.Tensor) -> torch.Tensor:
+    """
+    Counts elements in each row of x that are greater than or equal to the
+    corresponding value in values.  Use torch.compile to generate an
+    optimized kernel for this function; otherwise, it will create additional
+    copies of the input tensors and cause memory issues.
+    Args:
+        x (torch.Tensor): A 2D tensor of shape (batch_size, n_elements).
+        values (torch.Tensor): A 2D tensor of shape (batch_size, 1).
+    Returns:
+        torch.Tensor: A 1D tensor of shape (batch_size,) with the counts.
+    """
+    return (x >= values).sum(-1)
+
+
+def gather_logprobs(
+    self,
+    logprobs: torch.Tensor,
+    num_logprobs: int,
+    token_ids: torch.Tensor,
+) -> LogprobsTensors:
+    """
+    Gather logprobs for topk and sampled/prompt token.
+
+    Args:
+        logprobs: (num tokens) x (vocab) tensor
+        num_logprobs: minimum number of logprobs to
+                    retain per token
+        token_ids: prompt tokens (if prompt logprobs)
+                    or sampled tokens (if sampled
+                    logprobs); 1D token ID tensor
+                    with (num tokens) elements
+                    Must be int64.
+
+    Returns:
+        Token + top-k int indices tensor, (num tokens) x (num_logprobs + 1)
+        Token + top-k float logprobs tensor, (num tokens) x (num_logprobs + 1)
+        Sampled/prompt token rank tensor, (num tokens)
+    """
+    assert token_ids.dtype == torch.int64
+    # Find the topK values.
+    topk_logprobs, topk_indices = torch.topk(logprobs, num_logprobs, dim=-1)
+
+    # Gather the logprob of the prompt or sampled token.
+    token_ids = token_ids.unsqueeze(-1)
+    token_logprobs = logprobs.gather(-1, token_ids)
+
+    # Rank of the actual token: count of vocab entries with logprob >= its own.
+    token_ranks = batched_count_greater_than(logprobs, token_logprobs)
+
+    # Concatenate together with the topk; the actual token is column 0.
+    indices = torch.cat((token_ids, topk_indices), dim=1)
+    logprobs = torch.cat((token_logprobs, topk_logprobs), dim=1)
+
+    # Use int32 to reduce the tensor size.
+    indices = indices.to(torch.int32)
+
+    return LogprobsTensors(indices, logprobs, token_ranks)
+
+
+Sampler.gather_logprobs = gather_logprobs
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -45,8 +45,9 @@ from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
-from vllm.model_executor.models.interfaces_base import (VllmModelForPooling,
-                                                        is_pooling_model)
+from vllm.model_executor.models.interfaces import supports_transcription
+from vllm.model_executor.models.interfaces_base import (
+    VllmModelForPooling, is_pooling_model, is_text_generation_model)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
@@ -66,7 +67,7 @@ from vllm.v1.sample.sampler import Sampler
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.utils import (gather_mm_placeholders,
+from vllm.v1.worker.utils import (bind_kv_cache, gather_mm_placeholders,
                                  sanity_check_mm_encoder_outputs,
                                  scatter_mm_placeholders)

@@ -88,15 +89,8 @@ from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

-if vllm_version_is("0.9.2"):
-    from vllm.model_executor.models.interfaces import has_step_pooler
-    from vllm.v1.utils import bind_kv_cache
-else:
-    from vllm.model_executor.models.interfaces import supports_transcription
-    from vllm.model_executor.models.interfaces_base import \
-        is_text_generation_model
+if not vllm_version_is("0.10.0"):
    from vllm.tasks import GenerationTask, SupportedTask
-    from vllm.v1.worker.utils import bind_kv_cache

 if TYPE_CHECKING:
    import xgrammar as xgr  # type: ignore[import-untyped]
@@ -409,7 +403,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            else:
                generator = None

-            if not vllm_version_is("0.9.2") and pooling_params:
+            if pooling_params:
                assert (task := pooling_params.task) is not None, (
                    "You did not set `task` in the API")
                model = cast(VllmModelForPooling, self.model)
@@ -585,10 +579,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        # OPTIMIZATION: Start copying the block table first.
        # This way, we can overlap the copy with the following CPU operations.
-        if vllm_version_is("0.9.2"):
-            self.input_batch.block_table.commit(num_reqs)
-        else:
-            self.input_batch.block_table.commit_block_table(num_reqs)
+        self.input_batch.block_table.commit_block_table(num_reqs)

        # Get the number of scheduled tokens for each request.
        req_ids = self.input_batch.req_ids
@@ -939,10 +930,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        # OPTIMIZATION: Start copying the block table first.
        # This way, we can overlap the copy with the following CPU operations.
-        if vllm_version_is("0.9.2"):
-            self.input_batch.block_table.commit(num_reqs)
-        else:
-            self.input_batch.block_table.commit_block_table(num_reqs)
+        self.input_batch.block_table.commit_block_table(num_reqs)

        # Get the number of scheduled tokens for each request.
        # TODO: The Python loop can be slow. Optimize.
@@ -1771,57 +1759,33 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        req_num_tokens = num_tokens // num_reqs

-        if vllm_version_is("0.9.2"):
-            dummy_metadata = PoolingMetadata(
-                prompt_lens=torch.tensor(
-                    [h.shape[0] for h in hidden_states_list],
-                    device=self.device),
-                prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
-                                             dtype=torch.int32,
-                                             device=self.device),
-                pooling_params=[PoolingParams()] * num_reqs)
-            try:
-                pooler_output = self.model.pooler(
-                    hidden_states=hidden_states_list,
-                    pooling_metadata=dummy_metadata)
-            except RuntimeError as e:
-                if 'out of memory' in str(e):
-                    raise RuntimeError(
-                        "NPU out of memory occurred when warming up pooler with "
-                        f"{num_reqs} dummy requests. Please try lowering "
-                        "`max_num_seqs` or `gpu_memory_utilization` when "
-                        "initializing the engine.") from e
-                else:
-                    raise e
-        else:
-            model = cast(VllmModelForPooling, self.model)
-            dummy_task = self.get_supported_pooling_tasks()[0]
-            dummy_pooling_params = PoolingParams(task=dummy_task)
+        model = cast(VllmModelForPooling, self.model)
+        dummy_task = self.get_supported_pooling_tasks()[0]
+        dummy_pooling_params = PoolingParams(task=dummy_task)

-            to_update = model.pooler.get_pooling_updates(dummy_task)
-            to_update.apply(dummy_pooling_params)
+        to_update = model.pooler.get_pooling_updates(dummy_task)
+        to_update.apply(dummy_pooling_params)

-            dummy_metadata = PoolingMetadata(
-                prompt_lens=torch.tensor(
-                    [h.shape[0] for h in hidden_states_list],
-                    device=self.device),
-                prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
-                                             dtype=torch.int32,
-                                             device=self.device),
-                pooling_params=[dummy_pooling_params] * num_reqs)
+        dummy_metadata = PoolingMetadata(
+            prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list],
+                                     device=self.device),
+            prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
+                                         dtype=torch.int32,
+                                         device=self.device),
+            pooling_params=[dummy_pooling_params] * num_reqs)

-            try:
-                pooler_output = model.pooler(hidden_states=hidden_states_list,
-                                             pooling_metadata=dummy_metadata)
-            except RuntimeError as e:
-                if 'out of memory' in str(e):
-                    raise RuntimeError(
-                        "NPU out of memory occurred when warming up pooler with "
-                        f"{num_reqs} dummy requests. Please try lowering "
-                        "`max_num_seqs` or `gpu_memory_utilization` when "
-                        "initializing the engine.") from e
-                else:
-                    raise e
+        try:
+            pooler_output = model.pooler(hidden_states=hidden_states_list,
+                                         pooling_metadata=dummy_metadata)
+        except RuntimeError as e:
+            if 'out of memory' in str(e):
+                raise RuntimeError(
+                    "NPU out of memory occurred when warming up pooler with "
+                    f"{num_reqs} dummy requests. Please try lowering "
+                    "`max_num_seqs` or `gpu_memory_utilization` when "
+                    "initializing the engine.") from e
+            else:
+                raise e

        return pooler_output

@@ -1841,9 +1805,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                   QKVParallelLinear, RowParallelLinear)):
                        module.weight.data = torch_npu.npu_format_cast(
                            module.weight.data, ACL_FORMAT_FRACTAL_NZ)
-
-            if vllm_version_is("0.9.2") and has_step_pooler(self.model):
-                self.input_batch.logits_processing_needs_token_ids_bool = True
            if self.drafter:
                logger.info("Loading drafter model...")
                if isinstance(self.drafter, EagleProposer):
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -35,8 +35,6 @@ from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice
 from vllm.v1.worker.block_table import MultiGroupBlockTable

-from vllm_ascend.utils import vllm_version_is
-
 _SAMPLING_EPS = 1e-5


@@ -246,11 +244,8 @@ class InputBatch:

        # req_index -> bad_words_token_ids
        self.bad_words_token_ids: dict[int, list[list[int]]] = {}
-        if vllm_version_is("0.9.2"):
-            self.logits_processing_needs_token_ids_bool = False
-        else:
-            self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
-                                                              dtype=bool)
+        self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
+                                                          dtype=bool)

        self.req_output_token_ids: list[Optional[list[int]]] = []

@@ -387,9 +382,6 @@ class InputBatch:
            if sampling_params.bad_words_token_ids:
                self.bad_words_token_ids[
                    req_index] = sampling_params.bad_words_token_ids
-        elif vllm_version_is("0.9.2"):
-            assert request.pooling_params is not None
-            self.pooling_params[req_id] = request.pooling_params
        elif pooling_params := request.pooling_params:
            self.pooling_params[req_id] = pooling_params
            self.logits_processing_needs_token_ids[req_index] = (
@@ -624,15 +616,10 @@ class InputBatch:
                       self.presence_penalties, num_reqs)
            copy_slice(self.repetition_penalties_cpu_tensor,
                       self.repetition_penalties, num_reqs)
-        if vllm_version_is("0.9.2"):
-            needs_prompt_token_ids = (
-                not self.no_penalties
-                or (self.num_reqs > 0
-                    and self.logits_processing_needs_token_ids_bool))
-        else:
-            needs_prompt_token_ids = (
-                not self.no_penalties
-                or self.logits_processing_needs_token_ids[:num_reqs].any())
+
+        needs_prompt_token_ids = (
+            not self.no_penalties
+            or self.logits_processing_needs_token_ids[:num_reqs].any())
        if needs_prompt_token_ids:
            # The prompt tokens are used only for applying penalties or
            # step pooling during the sampling/pooling process.
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -45,7 +45,7 @@ from vllm_ascend.utils import (sleep_mode_enabled, try_register_lib,
                               vllm_version_is)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

-if not vllm_version_is("0.9.2"):
+if not vllm_version_is("0.10.0"):
    from vllm.tasks import SupportedTask