Clean up v0.9.1 code (#1672)
vLLM has released 0.9.2. This PR drops 0.9.1 support; the sketch below illustrates the version-gating pattern that is removed throughout the diff.
- vLLM version: v0.9.1
- vLLM main: b942c094e3
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
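For orientation only (this sketch is not part of the diff): the clean-up is mechanical. Every `vllm_version_is("0.9.1")` branch is deleted and only the 0.9.2/main code path is kept. The function names below are illustrative, and `weight_loader` is assumed to accept the `return_success` keyword as in the deepseek hunks that follow.

# Illustrative sketch, not an excerpt from the repository.
from vllm_ascend.utils import vllm_version_is


def load_expert_weight_before(weight_loader, param, loaded_weight, name,
                              shard_id, expert_id):
    # Before this PR: call sites were gated on the installed vLLM version.
    if vllm_version_is("0.9.1"):
        # Legacy 0.9.1 signature (removed by this PR).
        weight_loader(param, loaded_weight, name,
                      shard_id=shard_id, expert_id=expert_id)
    else:
        weight_loader(param, loaded_weight, name,
                      shard_id=shard_id, expert_id=expert_id,
                      return_success=False)


def load_expert_weight_after(weight_loader, param, loaded_weight, name,
                             shard_id, expert_id):
    # After this PR: only the 0.9.2/main path remains, with no version check.
    weight_loader(param, loaded_weight, name,
                  shard_id=shard_id, expert_id=expert_id,
                  return_success=False)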
@@ -32,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager

from vllm_ascend.utils import vllm_version_is


class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler
@@ -366,32 +364,12 @@ class AscendScheduler(Scheduler):
req_to_new_block_ids[req.request_id])
for req in scheduled_new_reqs
]
if vllm_version_is("0.9.1"):
resumed_reqs_data = [
self._make_cached_request_data(
req,
num_scheduled_tokens[req.request_id],
len(scheduled_spec_decode_tokens.get(req.request_id, ())),
req_to_new_block_ids[req.request_id],
resumed_from_preemption=True,
) for req in scheduled_resumed_reqs
]
running_reqs_data = [
self._make_cached_request_data(
req,
num_scheduled_tokens[req.request_id],
len(scheduled_spec_decode_tokens.get(req.request_id, ())),
req_to_new_block_ids[req.request_id],
resumed_from_preemption=False,
) for req in scheduled_running_reqs
]
scheduled_cached_reqs = resumed_reqs_data + running_reqs_data
else:
cached_reqs_data = self._make_cached_request_data(
scheduled_running_reqs, scheduled_resumed_reqs,
num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_block_ids)
scheduled_cached_reqs = cached_reqs_data

cached_reqs_data = self._make_cached_request_data(
scheduled_running_reqs, scheduled_resumed_reqs,
num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_block_ids)
scheduled_cached_reqs = cached_reqs_data

scheduler_output = SchedulerOutput(
scheduled_new_reqs=new_reqs_data,

@@ -50,10 +50,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
# value is None, which means the system default C compiler will be used.
"C_COMPILER":
lambda: os.getenv("C_COMPILER", None),
# Whether to enable the topk optimization. It's disabled by default for experimental support
# We'll make it enabled by default in the future.
"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE":
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE", '0'))),
# The version of the Ascend chip. If not set, the default value is
# ASCEND910B1. It's used for package building. Please make sure that the
# version is correct.

@@ -78,7 +78,7 @@ from vllm_ascend.multistream.metadata import (MultiStreamConfig,
make_multistream_metadata_ds)
from vllm_ascend.multistream.ms_split import compute_split_seq_index
from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.utils import dispose_tensor, vllm_version_is
from vllm_ascend.utils import dispose_tensor

VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO

@@ -1032,19 +1032,12 @@ class CustomDeepseekDBOForCausalLM(DeepseekV2ForCausalLM):

param = params_dict[name]
weight_loader = param.weight_loader
if vllm_version_is("0.9.1"):
weight_loader(param,
loaded_weight,
name,
shard_id=shard_id,
expert_id=expert_id)
else:
weight_loader(param,
loaded_weight,
name,
shard_id=shard_id,
expert_id=expert_id,
return_success=False)
weight_loader(param,
loaded_weight,
name,
shard_id=shard_id,
expert_id=expert_id,
return_success=False)
break
else:
# Skip loading extra bias for GPTQ models.

@@ -75,7 +75,7 @@ from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.quantization.quant_config import AscendLinearMethod
from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
from vllm_ascend.utils import (dispose_tensor, npu_stream_switch,
npu_wait_tensor, vllm_version_is)
npu_wait_tensor)


class CustomDeepseekV2SiluAndMul(SiluAndMul):
@@ -936,19 +936,12 @@ class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):

param = params_dict[name]
weight_loader = param.weight_loader
if vllm_version_is("0.9.1"):
weight_loader(param,
loaded_weight,
name,
shard_id=shard_id,
expert_id=expert_id)
else:
weight_loader(param,
loaded_weight,
name,
shard_id=shard_id,
expert_id=expert_id,
return_success=False)
weight_loader(param,
loaded_weight,
name,
shard_id=shard_id,
expert_id=expert_id,
return_success=False)
break
else:
# Skip loading extra bias for GPTQ models.

@@ -28,6 +28,10 @@ from vllm.distributed import (GroupCoordinator, get_tensor_model_parallel_rank,
tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import get_dp_group, get_tp_group
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.fused_moe.config import \
FusedMoEConfig # isort: skip
from vllm.model_executor.layers.fused_moe.config import \
FusedMoEParallelConfig # isort: skip
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
from vllm.model_executor.layers.quantization.base_config import \
@@ -39,16 +43,7 @@ from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
from vllm_ascend.utils import (FusedMoEState, dispose_tensor,
get_fused_moe_state, is_310p, npu_stream_switch,
npu_wait_tensor, vllm_version_is)

if vllm_version_is("0.9.1"):
from vllm.model_executor.layers.fused_moe.layer import \
FusedMoEParallelConfig
from vllm.model_executor.layers.fused_moe.layer import \
MoEConfig as FusedMoEConfig
else:
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEParallelConfig)
npu_wait_tensor)

MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER

@@ -1177,27 +1172,15 @@ class AscendFusedMoE(FusedMoE):
if self.scoring_func != "softmax" and not self.use_grouped_topk:
raise ValueError("Only softmax scoring function is supported for "
"non-grouped topk.")

if vllm_version_is("0.9.1"):
moe = FusedMoEConfig(
num_experts=self.global_num_experts,
experts_per_token=top_k,
hidden_dim=hidden_size,
num_local_experts=self.local_num_experts,
moe_parallel_config=self.moe_parallel_config,
# TODO (bnell): this needs to be fixed for quantized types.
in_dtype=params_dtype,
)
else:
moe = FusedMoEConfig.make(
num_experts=self.global_num_experts,
experts_per_token=top_k,
hidden_dim=hidden_size,
num_local_experts=self.local_num_experts,
moe_parallel_config=self.moe_parallel_config,
# TODO (bnell): this needs to be fixed for quantized types.
in_dtype=params_dtype,
quant_config=quant_config)
moe = FusedMoEConfig.make(
num_experts=self.global_num_experts,
experts_per_token=top_k,
hidden_dim=hidden_size,
num_local_experts=self.local_num_experts,
moe_parallel_config=self.moe_parallel_config,
# TODO (bnell): this needs to be fixed for quantized types.
in_dtype=params_dtype,
quant_config=quant_config)

if quant_config is None:
self.quant_method = AscendUnquantizedFusedMoEMethod(moe)

@@ -24,9 +24,9 @@
# each worker's `__init__` function.
#
# Then in each kind of patch, there are three folders:
# - patch_0_9_1: contains the patches applied when vllm version is 0.9.1.
# - patch_0_9_2: contains the patches applied when vllm version is 0.9.2.
# - patch_main: contains the patches applied when vllm version is main branch.
# - patch_common: contains the patches applied in both 0.9.1 and main branch.
# - patch_common: contains the patches applied in both 0.9.2 and main branch.
#
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
# ----------------------------------------------------------------------------------
@@ -105,32 +105,6 @@
# Future Plan:
# Revert it when the related pr is merged in vllm and vllm-ascend.
#
# ** File: worker/patch_common/patch_sampler.py **
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.sample.sampler.Sampler.apply_top_k_top_p`
# Why:
# We need to use the patched `apply_top_k_top_p` in `sample`.
# The mainly reason to overwrite `apply_top_k_top_p` is
# to improve performance.
# How:
# Re-implementation the `apply_top_k_top_p` function by pytorch
# Related PR (if no, explain why):
# - https://github.com/vllm-project/vllm-ascend/pull/970
# Future Plan:
# Revert it when the ascend scatter performance improves.
#
# 2. `vllm.v1.sample.sampler.Sampler.apply_min_p`
# Why:
# We need to use the patched `apply_min_p` in `sample`.
# The mainly reason to overwrite `apply_min_p` is
# to improve performance.
# How:
# Re-implementation the `apply_min_p` function by pytorch
# Related PR (if no, explain why):
# - https://github.com/vllm-project/vllm-ascend/pull/970
# Future Plan:
# Revert it when the ascend indexput performance improves.
#
# ** File: worker/patch_common/patch_distributed.py **
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.distributed.parallel_state.GroupCoordinator`
@@ -154,4 +128,4 @@
# Related PR (if no, explain why):
# This is the problem in vllm-ascend
# Future Plan:
# Remove this patch once pytorch 2.7.0 is supported for vllm ascend.
# Remove this patch once pytorch 2.7.0 is supported for vllm ascend.

@@ -17,8 +17,8 @@
from vllm_ascend.utils import vllm_version_is

# Import specific patches for different versions
if vllm_version_is("0.9.1"):
from vllm_ascend.patch.platform import patch_0_9_1 # noqa: F401
if vllm_version_is("0.9.2"):
from vllm_ascend.patch.platform import patch_0_9_2 # noqa: F401
from vllm_ascend.patch.platform import patch_common # noqa: F401
else:
from vllm_ascend.patch.platform import patch_common # noqa: F401

@@ -18,8 +18,8 @@
from vllm_ascend.utils import vllm_version_is

# Import specific patches for different versions
if vllm_version_is("0.9.1"):
from vllm_ascend.patch.worker import patch_0_9_1 # noqa: F401
if vllm_version_is("0.9.2"):
from vllm_ascend.patch.worker import patch_0_9_2 # noqa: F401
from vllm_ascend.patch.worker import patch_common # noqa: F401
else:
from vllm_ascend.patch.worker import patch_common # noqa: F401

@@ -1,106 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional

import torch
import torch_npu
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
from vllm.v1.sample.sampler import Sampler

from vllm_ascend import envs


def apply_min_p(
self,
logits: torch.Tensor,
min_p: torch.Tensor,
) -> torch.Tensor:
"""
Filters logits using adaptive probability thresholding.
"""
# Convert logits to probability distribution
probability_values = torch.nn.functional.softmax(logits, dim=-1)
# Calculate maximum probabilities per sequence
max_probabilities = torch.amax(probability_values, dim=-1, keepdim=True)
# Reshape min_p for broadcasting
adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
# Identify valid tokens using threshold comparison
# Apply mask using boolean indexing
logits = logits.masked_fill(probability_values < adjusted_min_p,
-float('inf'))
return logits


def _apply_top_k_top_p(
logits: torch.Tensor,
k: torch.Tensor,
p: torch.Tensor,
) -> torch.Tensor:
if p is not None and k is not None:
# npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p)
return torch_npu.npu_top_k_top_p(logits, p, k)

probs = logits.softmax(dim=-1)
probs_sort, _ = probs.sort(dim=-1, descending=False)

if k is not None:
top_k_count = probs_sort.size(1) - k.to(torch.long) # shape: (batch, )
top_k_count = top_k_count.unsqueeze(dim=1)
top_k_cutoff = probs_sort.gather(-1, top_k_count)

# Make sure the no top-k rows are no-op.
no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1)
top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf"))

elements_to_discard = probs < top_k_cutoff
logits.masked_fill_(elements_to_discard, -float("inf"))

if p is not None:
cumprob = torch.cumsum(probs_sort, dim=-1)
top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1)
top_p_mask[:, -1] = False # at least one

top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1)
top_p_cutoff = probs_sort.gather(-1, top_p_count)
elements_to_discard = probs < top_p_cutoff
logits.masked_fill_(elements_to_discard, -float("inf"))

return logits


def topk_topp_forward_native(
self,
logits: torch.Tensor,
generators: dict[int, torch.Generator],
k: Optional[torch.Tensor],
p: Optional[torch.Tensor],
) -> torch.Tensor:
"""
PyTorch-native implementation of top-k and top-p sampling.

The logits tensor may be updated in-place.
"""
logits = _apply_top_k_top_p(logits, k, p)
probs = logits.softmax(dim=-1, dtype=torch.float32)
return random_sample(probs, generators)


Sampler.apply_min_p = apply_min_p
if envs.VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE:
TopKTopPSampler.forward_native = topk_topp_forward_native
@@ -14,4 +14,3 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import vllm_ascend.patch.worker.patch_0_9_1.patch_sampler # noqa

@@ -44,6 +44,7 @@ from vllm.logger import logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models.interfaces import has_step_pooler
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.multimodal.utils import group_mm_inputs_by_modality
@@ -79,7 +80,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
ProfileExecuteDuration,
check_torchair_cache_exist, is_310p,
maybe_converting_weight_acl_format,
vllm_version_is, write_kv_cache_bytes_to_file)
write_kv_cache_bytes_to_file)
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
@@ -95,9 +96,6 @@ import vllm.envs as envs_vllm

import vllm_ascend.envs as envs_ascend

if vllm_version_is("0.9.1"):
from vllm.v1.spec_decode.utils import is_spec_decode_supported

if is_310p():
torch_npu.npu.set_compile_mode(jit_compile=False)

@@ -408,16 +406,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else:
generator = None

# For vllm v0.9.1 version compatibility, we check if
# `pooling_params` is present in the new request data.
pooling_params = getattr(new_req_data, "pooling_params", None)
self.requests[req_id] = CachedRequestState(
req_id=req_id,
prompt_token_ids=new_req_data.prompt_token_ids,
mm_inputs=new_req_data.mm_inputs,
mm_positions=new_req_data.mm_positions,
sampling_params=sampling_params,
pooling_params=pooling_params,
pooling_params=new_req_data.pooling_params,
generator=generator,
block_ids=new_req_data.block_ids,
num_computed_tokens=new_req_data.num_computed_tokens,
@@ -465,62 +460,59 @@ class NPUModelRunner(LoRAModelRunnerMixin):
req_ids_to_add.append(req_id)

# Update the states of the running/resumed requests.
if vllm_version_is("0.9.1"):
for req_data in scheduler_output.scheduled_cached_reqs:
req_id = req_data.req_id
req_state = self.requests[req_id]
req_data = scheduler_output.scheduled_cached_reqs
is_last_rank = get_pp_group().is_last_rank
for i, req_id in enumerate(req_data.req_ids):
req_state = self.requests[req_id]
num_computed_tokens = req_data.num_computed_tokens[i]
new_block_ids = req_data.new_block_ids[i]
resumed_from_preemption = req_data.resumed_from_preemption[i]

# Update the cached states.
num_computed_tokens = req_data.num_computed_tokens
req_state.num_computed_tokens = num_computed_tokens
req_state.num_computed_tokens = num_computed_tokens
if not is_last_rank:
new_token_ids = req_data.new_token_ids[i]
# Add the sampled token(s) from the previous step (if any).
# This doesn't include "unverified" tokens like spec decode tokens.
num_new_tokens = (num_computed_tokens +
len(req_data.new_token_ids) -
num_new_tokens = (num_computed_tokens + len(new_token_ids) -
req_state.num_tokens)
if num_new_tokens == 1:
# Avoid slicing list in most common case.
req_state.output_token_ids.append(
req_data.new_token_ids[-1])
req_state.output_token_ids.append(new_token_ids[-1])
elif num_new_tokens > 0:
req_state.output_token_ids.extend(
req_data.new_token_ids[-num_new_tokens:])
# Update the block IDs.
if not req_data.resumed_from_preemption:
# Append the new blocks to the existing block IDs.
for block_ids, new_block_ids in zip( # type: ignore[call-overload]
req_state.block_ids,
req_data.new_block_ids,
strict=True):
block_ids.extend(new_block_ids)
else:
# The request is resumed from preemption.
# Replace the existing block IDs with the new ones.
req_state.block_ids = req_data.new_block_ids
new_token_ids[-num_new_tokens:])
# Update the block IDs.
if not resumed_from_preemption:
# Append the new blocks to the existing block IDs.
for block_ids, new_ids in zip( # type: ignore[call-overload]
req_state.block_ids, new_block_ids):
block_ids.extend(new_ids)
else:
# The request is resumed from preemption.
# Replace the existing block IDs with the new ones.
req_state.block_ids = new_block_ids

req_index = self.input_batch.req_id_to_index.get(req_id)
if req_index is None:
# The request is not in the persistent batch.
# The request was either preempted and resumed later, or was not
# scheduled in the previous step and needs to be added again.
req_ids_to_add.append(req_id)
continue
req_index = self.input_batch.req_id_to_index.get(req_id)
if req_index is None:
# The request is not in the persistent batch.
# The request was either preempted and resumed later, or was not
# scheduled in the previous step and needs to be added again.
req_ids_to_add.append(req_id)
continue

# Update the persistent batch.
self.input_batch.num_computed_tokens_cpu[req_index] = (
num_computed_tokens)
# Update the persistent batch.
self.input_batch.num_computed_tokens_cpu[req_index] = (
num_computed_tokens)

start_index = (len(req_state.block_ids) -
len(req_data.new_block_ids))
self.input_batch.block_table.append_row(
req_data.new_block_ids, req_index)
self.input_batch.block_table.append_row(new_block_ids, req_index)

if not is_last_rank:
# Add new_token_ids to token_ids_cpu.
start_token_index = num_computed_tokens
end_token_index = num_computed_tokens + len(
req_data.new_token_ids)
end_token_index = num_computed_tokens + len(new_token_ids)
self.input_batch.token_ids_cpu[
req_index,
start_token_index:end_token_index] = req_data.new_token_ids
start_token_index:end_token_index] = new_token_ids
self.input_batch.num_tokens_no_spec[
req_index] = end_token_index
# Add spec_token_ids to token_ids_cpu.
@@ -534,75 +526,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
start_index:end_token_index] = spec_token_ids
# NOTE(woosuk): `num_tokens` here may include spec decode tokens.
self.input_batch.num_tokens[req_index] = end_token_index
else:
req_data = scheduler_output.scheduled_cached_reqs
is_last_rank = get_pp_group().is_last_rank
for i, req_id in enumerate(req_data.req_ids):
req_state = self.requests[req_id]
num_computed_tokens = req_data.num_computed_tokens[i]
new_block_ids = req_data.new_block_ids[i]
resumed_from_preemption = req_data.resumed_from_preemption[i]

req_state.num_computed_tokens = num_computed_tokens
if not is_last_rank:
new_token_ids = req_data.new_token_ids[i]
# Add the sampled token(s) from the previous step (if any).
# This doesn't include "unverified" tokens like spec decode tokens.
num_new_tokens = (num_computed_tokens +
len(new_token_ids) -
req_state.num_tokens)
if num_new_tokens == 1:
# Avoid slicing list in most common case.
req_state.output_token_ids.append(new_token_ids[-1])
elif num_new_tokens > 0:
req_state.output_token_ids.extend(
new_token_ids[-num_new_tokens:])
# Update the block IDs.
if not resumed_from_preemption:
# Append the new blocks to the existing block IDs.
for block_ids, new_ids in zip( # type: ignore[call-overload]
req_state.block_ids, new_block_ids):
block_ids.extend(new_ids)
else:
# The request is resumed from preemption.
# Replace the existing block IDs with the new ones.
req_state.block_ids = new_block_ids

req_index = self.input_batch.req_id_to_index.get(req_id)
if req_index is None:
# The request is not in the persistent batch.
# The request was either preempted and resumed later, or was not
# scheduled in the previous step and needs to be added again.
req_ids_to_add.append(req_id)
continue

# Update the persistent batch.
self.input_batch.num_computed_tokens_cpu[req_index] = (
num_computed_tokens)

self.input_batch.block_table.append_row(
new_block_ids, req_index)

if not is_last_rank:
# Add new_token_ids to token_ids_cpu.
start_token_index = num_computed_tokens
end_token_index = num_computed_tokens + len(new_token_ids)
self.input_batch.token_ids_cpu[
req_index,
start_token_index:end_token_index] = new_token_ids
self.input_batch.num_tokens_no_spec[
req_index] = end_token_index
# Add spec_token_ids to token_ids_cpu.
spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
req_id, ())
if spec_token_ids:
start_index = end_token_index
end_token_index += len(spec_token_ids)
self.input_batch.token_ids_cpu[
req_index,
start_index:end_token_index] = spec_token_ids
# NOTE(woosuk): `num_tokens` here may include spec decode tokens.
self.input_batch.num_tokens[req_index] = end_token_index

# Check if the batch has changed. If not, we can skip copying the
# sampling metadata from CPU to GPU.
@@ -835,25 +758,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# compute completion's mrope_positions on-the-fly
dst_start = mrope_pos_ptr
dst_end = mrope_pos_ptr + completion_part_len

if vllm_version_is("0.9.1"):
self.mrope_positions_cpu[:, dst_start:dst_end] = \
MRotaryEmbedding.get_next_input_positions_tensor(
req.mrope_position_delta,
context_len=num_computed_tokens +
prompt_part_len,
seq_len=num_computed_tokens +
prompt_part_len +
completion_part_len,
)
else:
MRotaryEmbedding.get_next_input_positions_tensor(
out=self.mrope_positions_np,
out_offset=dst_start,
mrope_position_delta=req.mrope_position_delta,
context_len=num_computed_tokens + prompt_part_len,
num_new_tokens=completion_part_len,
)
MRotaryEmbedding.get_next_input_positions_tensor(
out=self.mrope_positions_np,
out_offset=dst_start,
mrope_position_delta=req.mrope_position_delta,
context_len=num_computed_tokens + prompt_part_len,
num_new_tokens=completion_part_len,
)

mrope_pos_ptr += completion_part_len

@@ -1661,30 +1572,29 @@ class NPUModelRunner(LoRAModelRunnerMixin):

for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[i].clear()
if not vllm_version_is("0.9.1"):
# Cache the sampled tokens in the model runner, so that the scheduler
# doesn't need to send them back.
# NOTE(woosuk): As an exception, when using PP, the scheduler sends
# the sampled tokens back, because there's no direct communication
# between the first-stage worker and the last-stage worker.
for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
if not sampled_ids:
continue
# Cache the sampled tokens in the model runner, so that the scheduler
# doesn't need to send them back.
# NOTE(woosuk): As an exception, when using PP, the scheduler sends
# the sampled tokens back, because there's no direct communication
# between the first-stage worker and the last-stage worker.
for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
if not sampled_ids:
continue

start_idx = self.input_batch.num_tokens_no_spec[req_idx]
end_idx = start_idx + len(sampled_ids)
assert end_idx <= self.model_config.max_model_len, (
"Sampled token IDs exceed the max model length. "
f"Total number of tokens: {end_idx} > max_model_len: "
f"{self.model_config.max_model_len}")
start_idx = self.input_batch.num_tokens_no_spec[req_idx]
end_idx = start_idx + len(sampled_ids)
assert end_idx <= self.model_config.max_model_len, (
"Sampled token IDs exceed the max model length. "
f"Total number of tokens: {end_idx} > max_model_len: "
f"{self.model_config.max_model_len}")

self.input_batch.token_ids_cpu[
req_idx, start_idx:end_idx] = sampled_ids
self.input_batch.num_tokens_no_spec[req_idx] = end_idx
self.input_batch.num_tokens[req_idx] = end_idx
req_id = self.input_batch.req_ids[req_idx]
req_state = self.requests[req_id]
req_state.output_token_ids.extend(sampled_ids)
self.input_batch.token_ids_cpu[req_idx,
start_idx:end_idx] = sampled_ids
self.input_batch.num_tokens_no_spec[req_idx] = end_idx
self.input_batch.num_tokens[req_idx] = end_idx
req_id = self.input_batch.req_ids[req_idx]
req_state = self.requests[req_id]
req_state.output_token_ids.extend(sampled_ids)

spec_token_ids = self._get_spec_token_ids(
valid_sampled_token_ids,
@@ -1697,25 +1607,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
attn_metadata,
aux_hidden_states,
)
if vllm_version_is("0.9.1"):
model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=valid_sampled_token_ids,
spec_token_ids=spec_token_ids,
logprobs=logprobs_lists,
prompt_logprobs_dict=prompt_logprobs_dict,
)
else:
model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=valid_sampled_token_ids,
spec_token_ids=spec_token_ids,
logprobs=logprobs_lists,
prompt_logprobs_dict=prompt_logprobs_dict,
pooler_output=[],
)

model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=valid_sampled_token_ids,
spec_token_ids=spec_token_ids,
logprobs=logprobs_lists,
prompt_logprobs_dict=prompt_logprobs_dict,
pooler_output=[],
)

durations = ProfileExecuteDuration().pop_captured_sync()
if durations:
@@ -2024,15 +1925,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
QKVParallelLinear, RowParallelLinear)):
module.weight.data = torch_npu.npu_format_cast(
module.weight.data, ACL_FORMAT_FRACTAL_NZ)

try:
# For version compatibility, remove this after we abort vllm v0.9.1 support
from vllm.model_executor.models.interfaces import \
has_step_pooler # type: ignore
if has_step_pooler(self.model):
self.input_batch.logits_processing_needs_token_ids = True
except ImportError:
pass
if has_step_pooler(self.model):
self.input_batch.logits_processing_needs_token_ids = True
if self.drafter:
logger.info("Loading drafter model...")
if self.use_aux_hidden_state_outputs:
@@ -2362,14 +2256,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):

# Skip requests that require top-p, top-k, etc.
req_id = self.input_batch.req_ids[i]
if vllm_version_is("0.9.1"):
if not is_spec_decode_supported(req_id, self.input_batch):
draft_token_ids.append([])
continue
else:
if req_id in self.input_batch.spec_decode_unsupported_reqs:
draft_token_ids.append([])
continue
if req_id in self.input_batch.spec_decode_unsupported_reqs:
draft_token_ids.append([])
continue

# Add sampled_token_ids to token_ids_cpu.
start_idx = self.input_batch.num_tokens_no_spec[i]

@@ -28,15 +28,13 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.sample.logits_processor import init_builtin_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
from vllm.v1.utils import copy_slice
from vllm.v1.worker.block_table import MultiGroupBlockTable

from vllm_ascend.pool.metadata import PoolingMetadata
from vllm_ascend.utils import vllm_version_is

if not vllm_version_is("0.9.1"):
from vllm.v1.spec_decode.utils import is_spec_decode_unsupported

_SAMPLING_EPS = 1e-5

@@ -253,17 +251,13 @@ class InputBatch:

self.req_output_token_ids: list[Optional[list[int]]] = []

if not vllm_version_is("0.9.1"):
from vllm.v1.sample.logits_processor import \
init_builtin_logitsprocs

# Define logits processors.
# TODO(andy): logits processor list should be extensible via engine
# constructor argument; for now the list is fixed.
self.logitsprocs = init_builtin_logitsprocs(
pin_memory_available=pin_memory,
max_num_reqs=max_num_reqs + 1,
device=device)
# Define logits processors.
# TODO(andy): logits processor list should be extensible via engine
# constructor argument; for now the list is fixed.
self.logitsprocs = init_builtin_logitsprocs(
pin_memory_available=pin_memory,
max_num_reqs=max_num_reqs + 1,
device=device)

# This is updated each time the batch constituents change.
self.sampling_metadata = self._make_sampling_metadata()
@@ -314,8 +308,8 @@ class InputBatch:
self.block_table.add_row(request.block_ids, req_index)

if sampling_params := request.sampling_params:
if ((not vllm_version_is("0.9.1")) and self.is_spec_decode
and is_spec_decode_unsupported(sampling_params)):
if self.is_spec_decode and is_spec_decode_unsupported(
sampling_params):
self.spec_decode_unsupported_reqs.add(req_id)
if sampling_params.sampling_type == SamplingType.GREEDY:
# Avoid later division by zero.
@@ -641,48 +635,24 @@ class InputBatch:
self.allowed_token_ids_mask, num_reqs)
allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs]

if vllm_version_is("0.9.1"):
return SamplingMetadata(
temperature=temperature,
all_greedy=self.all_greedy,
all_random=self.all_random,
top_p=None if self.no_top_p else self.top_p[:num_reqs],
top_k=None if self.no_top_k else self.top_k[:num_reqs],
min_p=None if self.no_min_p else self.min_p[:num_reqs],
generators=self.generators,
max_num_logprobs=self.max_num_logprobs,
prompt_token_ids=prompt_token_ids,
frequency_penalties=self.frequency_penalties[:num_reqs],
presence_penalties=self.presence_penalties[:num_reqs],
repetition_penalties=self.repetition_penalties[:num_reqs],
output_token_ids=cast(list[list[int]],
self.req_output_token_ids),
min_tokens=self.min_tokens,
no_penalties=self.no_penalties,
logit_bias=self.logit_bias[:num_reqs],
allowed_token_ids_mask=allowed_token_ids_mask,
bad_words_token_ids=self.bad_words_token_ids,
)
else:
return SamplingMetadata(
temperature=temperature,
all_greedy=self.all_greedy,
all_random=self.all_random,
top_p=None if self.no_top_p else self.top_p[:num_reqs],
top_k=None if self.no_top_k else self.top_k[:num_reqs],
generators=self.generators,
max_num_logprobs=self.max_num_logprobs,
prompt_token_ids=prompt_token_ids,
frequency_penalties=self.frequency_penalties[:num_reqs],
presence_penalties=self.presence_penalties[:num_reqs],
repetition_penalties=self.repetition_penalties[:num_reqs],
output_token_ids=cast(list[list[int]],
self.req_output_token_ids),
no_penalties=self.no_penalties,
allowed_token_ids_mask=allowed_token_ids_mask,
bad_words_token_ids=self.bad_words_token_ids,
logitsprocs=self.logitsprocs,
)
return SamplingMetadata(
temperature=temperature,
all_greedy=self.all_greedy,
all_random=self.all_random,
top_p=None if self.no_top_p else self.top_p[:num_reqs],
top_k=None if self.no_top_k else self.top_k[:num_reqs],
generators=self.generators,
max_num_logprobs=self.max_num_logprobs,
prompt_token_ids=prompt_token_ids,
frequency_penalties=self.frequency_penalties[:num_reqs],
presence_penalties=self.presence_penalties[:num_reqs],
repetition_penalties=self.repetition_penalties[:num_reqs],
output_token_ids=cast(list[list[int]], self.req_output_token_ids),
no_penalties=self.no_penalties,
allowed_token_ids_mask=allowed_token_ids_mask,
bad_words_token_ids=self.bad_words_token_ids,
logitsprocs=self.logitsprocs,
)

@property
def pooling_metadata(self) -> PoolingMetadata: