Upgrade vLLM to v0.10.0 (#1927)

### What this PR does / why we need it?
- Upgrade to v0.10.0
- Drop v0.9.2 version compatibility
- Add patch `vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py` as a workaround for f3a683b7c9 in v0.10.0, and also add the e2e test `test_models_prompt_logprobs`
- Pin transformers<4.54.0 as a workaround for https://github.com/vllm-project/vllm-ascend/issues/2034

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Test locally: `VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs`
- CI passed

- vLLM version: v0.9.2
- vLLM main: 7728dd77bb

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-26 15:43:29 +08:00
parent 2f50304c19
commit 17a430f7b8
29 changed files with 198 additions and 251 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -17,7 +17,7 @@

 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import torch
 import torch_npu
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
+                               nd_to_nz_2d, nd_to_nz_spec)


 class AscendAttentionBackend(AttentionBackend):
@@ -43,8 +43,6 @@ class AscendAttentionBackend(AttentionBackend):

    @staticmethod
    def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionBackendImpl092
        return AscendAttentionBackendImpl

    @staticmethod
@@ -440,38 +438,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
        return output.view(num_tokens, self.hidden_size)


-class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )
-
-
 def unified_ascend_attention_with_output(
    query: torch.Tensor,
    key: torch.Tensor,
--- a/vllm_ascend/attention/attention_v1_torchair.py
+++ b/vllm_ascend/attention/attention_v1_torchair.py
@@ -16,7 +16,7 @@
 #

 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import numpy as np
 import torch
@@ -29,7 +29,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, vllm_version_is)
+                               nd_to_nz_2d)


 class AscendAttentionTorchairBackend(AttentionBackend):
@@ -41,8 +41,6 @@ class AscendAttentionTorchairBackend(AttentionBackend):

    @staticmethod
    def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionTorchairBackendImpl092
        return AscendAttentionTorchairBackendImpl

    @staticmethod
@@ -489,36 +487,3 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
                "to use ascend scheduler.")

        return output.view(num_tokens, self.hidden_size)
-
-
-class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
-                                            ):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1,12 +1,11 @@
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
-                    TypeVar)
+from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar

 import numpy as np
 import torch
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata, AttentionType,
+                                              AttentionMetadata,
                                              MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import get_current_vllm_config
@@ -22,7 +21,7 @@ from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
 from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
-from vllm_ascend.utils import npu_prefetch, vllm_version_is
+from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch

 if TYPE_CHECKING:
@@ -54,8 +53,6 @@ class AscendMLABackend(AttentionBackend):

    @staticmethod
    def get_impl_cls() -> Type["MLAAttentionImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendMLAImpl092
        return AscendMLAImpl


@@ -1212,34 +1209,3 @@ class AscendMLAImpl(MLAAttentionImpl):
                output[:num_decode_tokens] = output_decode

        return output_padded
-
-
-class AscendMLAImpl092(AscendMLAImpl):
-
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 num_kv_heads: int,
-                 alibi_slopes: Optional[List[float]],
-                 sliding_window: Optional[int],
-                 kv_cache_dtype: str,
-                 blocksparse_params: Optional[Dict[str, Any]] = None,
-                 logits_soft_cap: Optional[float] = None,
-                 attn_type: str = AttentionType.DECODER,
-                 kv_sharing_target_layer_name: Optional[str] = None,
-                 use_irope: bool = False,
-                 **kwargs) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-            **kwargs)
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -32,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager

-from vllm_ascend.utils import vllm_version_is
-

 class AscendScheduler(Scheduler):
    """This Scheduler extends vllm's original v1 scheduler
@@ -283,23 +281,12 @@ class AscendScheduler(Scheduler):
                    # allow the lower-priority requests to be scheduled.
                    req_index += 1
                    continue
-                if vllm_version_is("0.9.2"):
-                    num_draft_tokens = max(
-                        num_new_tokens + request.num_computed_tokens -
-                        request.num_tokens, 0)

                while True:
-                    if vllm_version_is("0.9.2"):
-                        new_blocks = self.kv_cache_manager.allocate_slots(
-                            request,
-                            num_new_tokens,
-                            num_draft_tokens=num_draft_tokens,
-                            num_lookahead_tokens=self.num_lookahead_tokens)
-                    else:
-                        new_blocks = self.kv_cache_manager.allocate_slots(
-                            request,
-                            num_new_tokens,
-                            num_lookahead_tokens=self.num_lookahead_tokens)
+                    new_blocks = self.kv_cache_manager.allocate_slots(
+                        request,
+                        num_new_tokens,
+                        num_lookahead_tokens=self.num_lookahead_tokens)
                    if new_blocks is None:
                        # The request cannot be scheduled.
                        # Preempt the lowest-priority request.
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -24,9 +24,9 @@
 #           each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-# - patch_0_9_2: contains the patches applied when vllm version is 0.9.2.
+# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0.
 # - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.9.2 and main branch.
+# - patch_common: contains the patches applied in both 0.10.0 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------
@@ -101,3 +101,16 @@
 #       - https://github.com/vllm-project/vllm-ascend/pull/1732
 #    Future Plan:
 #       Revert it when the ascend scatter performance improves.
+#
+# ** File: worker/patch_common/patch_sampler.py **
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. `vllm.v1.sample.sampler.Sampler.gather_logprobs`
+#    Why:
+#       We need to patch gather_logprobs so that it calls a batched_count_greater_than
+#       compiled with backend=current_platform.simple_compile_backend.
+#    How:
+#       Patch gather_logprobs to call the new batched_count_greater_than.
+#    Related PR (if no, explain why):
+#       - https://github.com/vllm-project/vllm/pull/21591
+#    Future Plan:
+#       Revert it when vLLM merges #21591 and releases a new version.
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is

 # Import specific patches for different versions
-if vllm_version_is("0.9.2"):
-    from vllm_ascend.patch.platform import patch_0_9_2  # noqa: F401
+if vllm_version_is("0.10.0"):
+    from vllm_ascend.patch.platform import patch_0_10_0  # noqa: F401
    from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
    from vllm_ascend.patch.platform import patch_common  # noqa: F401
--- a/vllm_ascend/patch/platform/patch_0_10_0/__init__.py
+++ b/vllm_ascend/patch/platform/patch_0_10_0/__init__.py
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is

 # Import specific patches for different versions
-if vllm_version_is("0.9.2"):
-    from vllm_ascend.patch.worker import patch_0_9_2  # noqa: F401
+if vllm_version_is("0.10.0"):
+    from vllm_ascend.patch.worker import patch_0_10_0  # noqa: F401
    from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
    from vllm_ascend.patch.worker import patch_common  # noqa: F401
--- a/vllm_ascend/patch/worker/patch_0_10_0/__init__.py
+++ b/vllm_ascend/patch/worker/patch_0_10_0/__init__.py
@@ -14,3 +14,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+import vllm_ascend.patch.worker.patch_0_10_0.patch_sampler_gather_logprobs  # noqa
--- a/vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py
+++ b/vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py
@@ -0,0 +1,87 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import torch
+from vllm.platforms import current_platform
+from vllm.v1.outputs import LogprobsTensors
+from vllm.v1.sample.sampler import Sampler
+
+
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
+def batched_count_greater_than(x: torch.Tensor,
+                               values: torch.Tensor) -> torch.Tensor:
+    """
+    Counts elements in each row of x that are greater than or equal to the
+    corresponding value in values. Use torch.compile to generate an optimized
+    kernel for this function; otherwise, it will create additional copies of
+    the input tensors and cause memory issues.
+    Args:
+        x (torch.Tensor): A 2D tensor of shape (batch_size, n_elements).
+        values (torch.Tensor): A 2D tensor of shape (batch_size, 1).
+    Returns:
+        torch.Tensor: A 1D tensor of shape (batch_size,) with the counts.
+    """
+    return (x >= values).sum(-1)
+
+
+def gather_logprobs(
+    self,
+    logprobs: torch.Tensor,
+    num_logprobs: int,
+    token_ids: torch.Tensor,
+) -> LogprobsTensors:
+    """
+    Gather logprobs for topk and sampled/prompt token.
+
+    Args:
+        logprobs: (num tokens) x (vocab) tensor
+        num_logprobs: minimum number of logprobs to
+                    retain per token
+        token_ids: prompt tokens (if prompt logprobs)
+                    or sampled tokens (if sampled
+                    logprobs); 1D token ID tensor
+                    with (num tokens) elements
+                    Must be int64.
+
+    Returns:
+        Top-k int indices tensor, (num tokens) x (num_logprobs + 1)
+        Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1)
+        Sampled token rank tensor, (num tokens)
+    """
+    assert token_ids.dtype == torch.int64
+    # Find the topK values.
+    topk_logprobs, topk_indices = torch.topk(logprobs, num_logprobs, dim=-1)
+
+    # Get the logprob of the prompt or sampled token.
+    token_ids = token_ids.unsqueeze(-1)
+    token_logprobs = logprobs.gather(-1, token_ids)
+
+    # Compute the ranks of the actual token.
+    token_ranks = batched_count_greater_than(logprobs, token_logprobs)
+
+    # Concatenate together with the topk.
+    indices = torch.cat((token_ids, topk_indices), dim=1)
+    logprobs = torch.cat((token_logprobs, topk_logprobs), dim=1)
+
+    # Use int32 to reduce the tensor size.
+    indices = indices.to(torch.int32)
+
+    return LogprobsTensors(indices, logprobs, token_ranks)
+
+
+Sampler.gather_logprobs = gather_logprobs
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -45,8 +45,9 @@ from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
-from vllm.model_executor.models.interfaces_base import (VllmModelForPooling,
-                                                        is_pooling_model)
+from vllm.model_executor.models.interfaces import supports_transcription
+from vllm.model_executor.models.interfaces_base import (
+    VllmModelForPooling, is_pooling_model, is_text_generation_model)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
@@ -66,7 +67,7 @@ from vllm.v1.sample.sampler import Sampler
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.utils import (gather_mm_placeholders,
+from vllm.v1.worker.utils import (bind_kv_cache, gather_mm_placeholders,
                                  sanity_check_mm_encoder_outputs,
                                  scatter_mm_placeholders)

@@ -88,15 +89,8 @@ from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

-if vllm_version_is("0.9.2"):
-    from vllm.model_executor.models.interfaces import has_step_pooler
-    from vllm.v1.utils import bind_kv_cache
-else:
-    from vllm.model_executor.models.interfaces import supports_transcription
-    from vllm.model_executor.models.interfaces_base import \
-        is_text_generation_model
+if not vllm_version_is("0.10.0"):
    from vllm.tasks import GenerationTask, SupportedTask
-    from vllm.v1.worker.utils import bind_kv_cache

 if TYPE_CHECKING:
    import xgrammar as xgr  # type: ignore[import-untyped]
@@ -409,7 +403,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            else:
                generator = None

-            if not vllm_version_is("0.9.2") and pooling_params:
+            if pooling_params:
                assert (task := pooling_params.task) is not None, (
                    "You did not set `task` in the API")
                model = cast(VllmModelForPooling, self.model)
@@ -585,10 +579,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        # OPTIMIZATION: Start copying the block table first.
        # This way, we can overlap the copy with the following CPU operations.
-        if vllm_version_is("0.9.2"):
-            self.input_batch.block_table.commit(num_reqs)
-        else:
-            self.input_batch.block_table.commit_block_table(num_reqs)
+        self.input_batch.block_table.commit_block_table(num_reqs)

        # Get the number of scheduled tokens for each request.
        req_ids = self.input_batch.req_ids
@@ -939,10 +930,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        # OPTIMIZATION: Start copying the block table first.
        # This way, we can overlap the copy with the following CPU operations.
-        if vllm_version_is("0.9.2"):
-            self.input_batch.block_table.commit(num_reqs)
-        else:
-            self.input_batch.block_table.commit_block_table(num_reqs)
+        self.input_batch.block_table.commit_block_table(num_reqs)

        # Get the number of scheduled tokens for each request.
        # TODO: The Python loop can be slow. Optimize.
@@ -1771,57 +1759,33 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        req_num_tokens = num_tokens // num_reqs

-        if vllm_version_is("0.9.2"):
-            dummy_metadata = PoolingMetadata(
-                prompt_lens=torch.tensor(
-                    [h.shape[0] for h in hidden_states_list],
-                    device=self.device),
-                prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
-                                             dtype=torch.int32,
-                                             device=self.device),
-                pooling_params=[PoolingParams()] * num_reqs)
-            try:
-                pooler_output = self.model.pooler(
-                    hidden_states=hidden_states_list,
-                    pooling_metadata=dummy_metadata)
-            except RuntimeError as e:
-                if 'out of memory' in str(e):
-                    raise RuntimeError(
-                        "NPU out of memory occurred when warming up pooler with "
-                        f"{num_reqs} dummy requests. Please try lowering "
-                        "`max_num_seqs` or `gpu_memory_utilization` when "
-                        "initializing the engine.") from e
-                else:
-                    raise e
-        else:
-            model = cast(VllmModelForPooling, self.model)
-            dummy_task = self.get_supported_pooling_tasks()[0]
-            dummy_pooling_params = PoolingParams(task=dummy_task)
+        model = cast(VllmModelForPooling, self.model)
+        dummy_task = self.get_supported_pooling_tasks()[0]
+        dummy_pooling_params = PoolingParams(task=dummy_task)

-            to_update = model.pooler.get_pooling_updates(dummy_task)
-            to_update.apply(dummy_pooling_params)
+        to_update = model.pooler.get_pooling_updates(dummy_task)
+        to_update.apply(dummy_pooling_params)

-            dummy_metadata = PoolingMetadata(
-                prompt_lens=torch.tensor(
-                    [h.shape[0] for h in hidden_states_list],
-                    device=self.device),
-                prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
-                                             dtype=torch.int32,
-                                             device=self.device),
-                pooling_params=[dummy_pooling_params] * num_reqs)
+        dummy_metadata = PoolingMetadata(
+            prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list],
+                                     device=self.device),
+            prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
+                                         dtype=torch.int32,
+                                         device=self.device),
+            pooling_params=[dummy_pooling_params] * num_reqs)

-            try:
-                pooler_output = model.pooler(hidden_states=hidden_states_list,
-                                             pooling_metadata=dummy_metadata)
-            except RuntimeError as e:
-                if 'out of memory' in str(e):
-                    raise RuntimeError(
-                        "NPU out of memory occurred when warming up pooler with "
-                        f"{num_reqs} dummy requests. Please try lowering "
-                        "`max_num_seqs` or `gpu_memory_utilization` when "
-                        "initializing the engine.") from e
-                else:
-                    raise e
+        try:
+            pooler_output = model.pooler(hidden_states=hidden_states_list,
+                                         pooling_metadata=dummy_metadata)
+        except RuntimeError as e:
+            if 'out of memory' in str(e):
+                raise RuntimeError(
+                    "NPU out of memory occurred when warming up pooler with "
+                    f"{num_reqs} dummy requests. Please try lowering "
+                    "`max_num_seqs` or `gpu_memory_utilization` when "
+                    "initializing the engine.") from e
+            else:
+                raise e

        return pooler_output

@@ -1841,9 +1805,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                   QKVParallelLinear, RowParallelLinear)):
                        module.weight.data = torch_npu.npu_format_cast(
                            module.weight.data, ACL_FORMAT_FRACTAL_NZ)
-
-            if vllm_version_is("0.9.2") and has_step_pooler(self.model):
-                self.input_batch.logits_processing_needs_token_ids_bool = True
            if self.drafter:
                logger.info("Loading drafter model...")
                if isinstance(self.drafter, EagleProposer):
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -35,8 +35,6 @@ from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice
 from vllm.v1.worker.block_table import MultiGroupBlockTable

-from vllm_ascend.utils import vllm_version_is
-
 _SAMPLING_EPS = 1e-5


@@ -246,11 +244,8 @@ class InputBatch:

        # req_index -> bad_words_token_ids
        self.bad_words_token_ids: dict[int, list[list[int]]] = {}
-        if vllm_version_is("0.9.2"):
-            self.logits_processing_needs_token_ids_bool = False
-        else:
-            self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
-                                                              dtype=bool)
+        self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
+                                                          dtype=bool)

        self.req_output_token_ids: list[Optional[list[int]]] = []

@@ -387,9 +382,6 @@ class InputBatch:
            if sampling_params.bad_words_token_ids:
                self.bad_words_token_ids[
                    req_index] = sampling_params.bad_words_token_ids
-        elif vllm_version_is("0.9.2"):
-            assert request.pooling_params is not None
-            self.pooling_params[req_id] = request.pooling_params
        elif pooling_params := request.pooling_params:
            self.pooling_params[req_id] = pooling_params
            self.logits_processing_needs_token_ids[req_index] = (
@@ -624,15 +616,10 @@ class InputBatch:
                       self.presence_penalties, num_reqs)
            copy_slice(self.repetition_penalties_cpu_tensor,
                       self.repetition_penalties, num_reqs)
-        if vllm_version_is("0.9.2"):
-            needs_prompt_token_ids = (
-                not self.no_penalties
-                or (self.num_reqs > 0
-                    and self.logits_processing_needs_token_ids_bool))
-        else:
-            needs_prompt_token_ids = (
-                not self.no_penalties
-                or self.logits_processing_needs_token_ids[:num_reqs].any())
+
+        needs_prompt_token_ids = (
+            not self.no_penalties
+            or self.logits_processing_needs_token_ids[:num_reqs].any())
        if needs_prompt_token_ids:
            # The prompt tokens are used only for applying penalties or
            # step pooling during the sampling/pooling process.
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -45,7 +45,7 @@ from vllm_ascend.utils import (sleep_mode_enabled, try_register_lib,
                               vllm_version_is)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

-if not vllm_version_is("0.9.2"):
+if not vllm_version_is("0.10.0"):
    from vllm.tasks import SupportedTask