diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml
index d1a6123..999fb6a 100644
--- a/.github/workflows/accuracy_test.yaml
+++ b/.github/workflows/accuracy_test.yaml
@@ -34,8 +34,7 @@ on:
         # Current supported vLLM versions
         options:
           - main
-          - v0.9.0.1
-          - v0.9.0
+          - v0.9.1
           - v0.7.3
       vllm-ascend-version:
         description: 'vllm-ascend version:'
@@ -159,7 +158,7 @@ jobs:
           repository: vllm-project/vllm
           path: ./vllm-empty
           # Please also update this when bump matched version
-          ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }}
+          ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}
 
       - name: Install vllm-project/vllm from source
         working-directory: ./vllm-empty
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
deleted file mode 100644
index 91cd9c4..0000000
--- a/.github/workflows/actionlint.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-#
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Adapted from vllm-project/vllm/blob/main/.github
-#
-
-name: Lint GitHub Actions workflows
-on:
-  pull_request:
-    branches:
-      - 'main'
-      - '*-dev'
-    paths:
-      - '.github/workflows/*.ya?ml'
-      - '.github/workflows/actionlint.*'
-      - '.github/workflows/matchers/actionlint.json'
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-jobs:
-  actionlint:
-    runs-on: ubuntu-latest
-    steps:
-      - name: "Checkout"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: "Run actionlint"
-        env:
-          SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
-        run: |
-          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-          tools/actionlint.sh -color
diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml
index da4dbcc..6ee1b45 100644
--- a/.github/workflows/nightly_benchmarks.yaml
+++ b/.github/workflows/nightly_benchmarks.yaml
@@ -50,7 +50,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.9.0
+          - vllm_branch: v0.9.1
             vllm_ascend_branch: main
     container:
       image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index b023502..073058d 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -33,6 +33,9 @@ on:
       - '!benchmarks/**'
       - 'tools/mypy.sh'
       - 'mypy.ini'
+      - '.github/workflows/*.ya?ml'
+      - '.github/workflows/actionlint.*'
+      - '.github/workflows/matchers/actionlint.json'
 
 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -87,6 +90,13 @@ jobs:
           repository: vllm-project/vllm
           path: vllm-empty
 
+      - name: Actionlint Check
+        env:
+          SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
+        run: |
+          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+          tools/actionlint.sh -color
+
       - name: Install vllm-project/vllm from source
         working-directory: vllm-empty
         run: |
@@ -105,7 +115,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.9.0]
+        vllm_version: [main, v0.9.1]
     concurrency:
       group: >
         ${{
@@ -193,6 +203,7 @@ jobs:
         fi
 
       - name: Run vllm-project/vllm-ascend test on V0 engine
+        if: ${{ github.event_name == 'schedule' }}
         env:
           VLLM_USE_V1: 0
         run: |
diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml
index 2cc8917..c17200a 100644
--- a/.github/workflows/vllm_ascend_test_long_term.yaml
+++ b/.github/workflows/vllm_ascend_test_long_term.yaml
@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.9.0]
+        vllm_version: [main, v0.9.1]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}
     container:
diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml
index 7548b07..c2c76c9 100644
--- a/.github/workflows/vllm_ascend_test_pd.yaml
+++ b/.github/workflows/vllm_ascend_test_pd.yaml
@@ -41,7 +41,7 @@ jobs:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
     strategy:
       matrix:
-        vllm_verison: [main, v0.9.0]
+        vllm_verison: [main, v0.9.1]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8
 
diff --git a/Dockerfile b/Dockerfile
index 1dfd10c..952e77f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.0
+ARG VLLM_TAG=v0.9.1
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index ffd1174..2ff3d0b 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.0
+ARG VLLM_TAG=v0.9.1
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
diff --git a/tests/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py
index 64d4cba..70b8929 100644
--- a/tests/singlecard/compile/test_simple.py
+++ b/tests/singlecard/compile/test_simple.py
@@ -14,8 +14,6 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                          set_current_vllm_config)
 from vllm.utils import direct_register_custom_op
 
-from vllm_ascend.utils import vllm_version_is
-
 global_counter = 0
 
 # create a library to hold the custom op
@@ -93,28 +91,14 @@ def test_simple_piecewise_compile():
 
     model = SillyModel(vllm_config=vllm_config, prefix="")
 
     inputs = torch.randn(100).npu()
-
-    if vllm_version_is("0.9.0"):
-        kwargs = {
-            "num_graphs_seen": 1,  # one graph for the model
-            "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
-            "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
-            "num_backend_compilations":
-            3,  # num_piecewise_capturable_graphs_seen
-            "num_cudagraph_caputured":
-            6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-        }
-    else:
-        kwargs = {
-            "num_graphs_seen": 1,  # one graph for the model
-            "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
-            "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
-            "num_backend_compilations":
-            3,  # num_piecewise_capturable_graphs_seen
-            "num_cudagraph_captured":
-            6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-        }
-
+    kwargs = {
+        "num_graphs_seen": 1,  # one graph for the model
+        "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
+        "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
+        "num_backend_compilations": 3,  # num_piecewise_capturable_graphs_seen
+        "num_cudagraph_captured":
+        6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    }
     with compilation_counter.expect(kwargs):
         model(inputs)
diff --git a/tests/singlecard/test_scheduler.py b/tests/singlecard/test_scheduler.py
index d1c6062..8021f03 100644
--- a/tests/singlecard/test_scheduler.py
+++ b/tests/singlecard/test_scheduler.py
@@ -31,7 +31,6 @@ from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
 from vllm_ascend.core.scheduler import AscendScheduler
-from vllm_ascend.utils import vllm_version_is
 
 EOS_TOKEN_ID = 50256
 
@@ -87,27 +86,15 @@
     vllm_config = VllmConfig(scheduler_config=scheduler_config,
                              model_config=model_config,
                              cache_config=cache_config)
-
-    if vllm_version_is("0.9.0"):
-        kv_cache_config = KVCacheConfig(
-            num_blocks=10000,  # A large number of blocks to hold all requests
-            tensors={},
-            kv_cache_groups=[
-                KVCacheGroupSpec(['layer'],
-                                 FullAttentionSpec(16, 1, 1, torch.float32,
-                                                   False))
-            ],
-        )
-    else:
-        kv_cache_config = KVCacheConfig(
-            num_blocks=10000,  # A large number of blocks to hold all requests
-            kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
-            kv_cache_groups=[
-                KVCacheGroupSpec(['layer'],
-                                 FullAttentionSpec(16, 1, 1, torch.float32,
-                                                   False, None))
-            ],
-        )
+    kv_cache_config = KVCacheConfig(
+        num_blocks=10000,  # A large number of blocks to hold all requests
+        kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
+        kv_cache_groups=[
+            KVCacheGroupSpec(['layer'],
+                             FullAttentionSpec(16, 1, 1, torch.float32, False,
+                                               None))
+        ],
+    )
     cache_config.num_gpu_blocks = 10000
     return AscendScheduler(
         vllm_config,
@@ -135,27 +122,15 @@ def create_requests(num_requests: int,
         else:
             mm_position = None
             mm_inputs = None
-        if vllm_version_is("0.9.0"):
-            request = Request(
-                request_id=f"{i}",
-                prompt_token_ids=[i] * num_tokens,
-                sampling_params=sampling_params,
-                multi_modal_inputs=mm_inputs,
-                multi_modal_placeholders=mm_position,
-                multi_modal_hashes=None,
-                arrival_time=0,
-                eos_token_id=EOS_TOKEN_ID,
-            )
-        else:
-            request = Request(
-                request_id=f"{i}",
-                prompt_token_ids=[i] * num_tokens,
-                sampling_params=sampling_params,
-                multi_modal_inputs=mm_inputs,
-                multi_modal_placeholders=mm_position,
-                multi_modal_hashes=None,
-                eos_token_id=EOS_TOKEN_ID,
-            )
+        request = Request(
+            request_id=f"{i}",
+            prompt_token_ids=[i] * num_tokens,
+            sampling_params=sampling_params,
+            multi_modal_inputs=mm_inputs,
+            multi_modal_placeholders=mm_position,
+            multi_modal_hashes=None,
+            eos_token_id=EOS_TOKEN_ID,
+        )
         requests.append(request)
 
     return requests
diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py
index 95ce693..c6a800b 100644
--- a/vllm_ascend/compilation/piecewise_backend.py
+++ b/vllm_ascend/compilation/piecewise_backend.py
@@ -31,8 +31,6 @@ from vllm.config import VllmConfig
 from vllm.logger import logger
 from vllm.utils import weak_ref_tensors
 
-from vllm_ascend.utils import vllm_version_is
-
 
 @dataclasses.dataclass
 class ConcreteSizeEntry:
@@ -206,11 +204,7 @@ class NPUPiecewiseBackend:
             # to save memory
             entry.output = weak_ref_tensors(output)
             entry.aclgraph = aclgraph
-
-            if vllm_version_is("0.9.0"):
-                compilation_counter.num_cudagraph_caputured += 1
-            else:
-                compilation_counter.num_cudagraph_captured += 1
+            compilation_counter.num_cudagraph_captured += 1
 
             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
index 42f5d9c..05c663f 100644
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -29,8 +29,6 @@ from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class AscendScheduler(Scheduler):
     """This Scheduler extends vllm's original v1 scheduler
@@ -129,12 +127,7 @@ class AscendScheduler(Scheduler):
                     continue
 
             assert num_new_tokens > 0
-
-            if vllm_version_is("0.9.0"):
-                blocks = computed_blocks.blocks
-            else:
-                blocks = computed_blocks.blocks[0]
-
+            blocks = computed_blocks.blocks[0]
             watermark = getattr(self.scheduler_config, "watermark", 0.01)
             if not self._check_watermark_for_prefill(request, num_new_tokens,
                                                      blocks, watermark):
@@ -330,14 +323,8 @@ class AscendScheduler(Scheduler):
                               len(computed_blocks) * self.block_size)
         num_required_blocks = cdiv(num_new_tokens + num_computed_tokens,
                                    self.block_size)
-
-        if vllm_version_is("0.9.0"):
-            req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[
-                request.request_id]
-        else:
-            req_blocks = self.kv_cache_manager.coordinator.get_blocks(
-                request.request_id)
-
+        req_blocks = self.kv_cache_manager.coordinator.get_blocks(
+            request.request_id)
         num_new_blocks = (num_required_blocks - len(req_blocks) -
                           len(computed_blocks))
         num_evictable_computed_blocks = sum(1 for blk in computed_blocks
diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 4be92c2..3c24bfc 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -24,9 +24,9 @@
 # each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
+# - patch_0_9_1: contains the patches applied when vllm version is 0.9.1.
 # - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.9.0 and main branch.
+# - patch_common: contains the patches applied in both 0.9.1 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------
@@ -35,17 +35,6 @@
 # --------------------------------
 # * Platform Patch:
 # =================
-# ** File: platform/patch_0_9_0/patch_distributed.py**
-#  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()`
-#    Why:
-#       vllm distributed use gloo backend by default to initialize stateless process group, but we want to use hccl here
-#    How:
-#       Add hccl backend to the `stateless_init_torch_distributed_process_group`
-#    Related PR (if no, explain why):
-#       https://github.com/vllm-project/vllm/pull/18763
-#    Future Plan:
-#       Remove this patch once vllm is upgraded to 0.9.1
 # ** File: platform/patch_common/patch_distributed.py**
 #  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.distributed.parallel_state.destroy_model_parallel()`
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index e724fe5..4ec38e3 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.9.0"):
-    from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
+if vllm_version_is("0.9.1"):
+    from vllm_ascend.patch.platform import patch_0_9_1  # noqa: F401
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
diff --git a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py b/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py
deleted file mode 100644
index d468326..0000000
--- a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import torch
-from torch.distributed import ProcessGroup
-from torch.distributed.distributed_c10d import (Backend, PrefixStore,
-                                                _get_default_timeout,
-                                                is_nccl_available)
-from torch.distributed.rendezvous import rendezvous
-from vllm.distributed import utils
-
-
-def stateless_init_torch_distributed_process_group(
-        host: str, port: int, rank: int, world_size: int,
-        backend: str) -> ProcessGroup:
-    """
-    A replacement for `torch.distributed.init_process_group` that does not
-    pollute the global state. The created ProcessGroup object can be used for
-    some operations such as `allreduce`, because it does not depend on the
-    global rank. However, some operations such as `broadcast` cannot be used
-    because it depends on the global rank.
-
-    # TODO: ask for help from PyTorch team if we need the `broadcast` operation.
-
-    This function is useful when we are not sure about the total number of
-    processes in the process group. For example, we may have process
-    1, 2, ..., 8 who want to communicate, and process 9 might be the same
-    process as process 1, or it might be a different process; process 10
-    might be the same process as process 5, or it might be a different process.
-    In this case, how can we reliably form a communication channel within
-    process 9 and 10, without affecting the communication channel within
-    process 1, 2, ..., 8?
-
-    One possible solution is to figure out if process 9 and 10 are the same
-    as process 1 and 5 beforehand, and then form a communication channel
-    based on the information, adjusting the ranks and world_size etc. However,
-    figuring out the information is not always easy, and it will interfere
-    with the main communication channel.
-
-    Our solution is to always form a communication channel with process 1, 2,
-    ..., 8, and then use this function to form another communication channel
-    with process 9 and 10. This way, regardless of whether process 9 and 10
-    are the same as process 1 and 5, the main communication channel is
-    always formed with process 1, 2, ..., 8, and the additional communication
-    channel is formed with process 9 and 10.
-    """
-    init_method = f"tcp://{host}:{port}"
-    backend = Backend(backend)  # it is basically string
-    timeout = _get_default_timeout(backend)
-
-    store, rank, world_size = next(
-        rendezvous(init_method, rank, world_size, timeout=timeout))
-    store.set_timeout(timeout)
-
-    group_rank = rank
-    group_size = world_size
-
-    # Use a PrefixStore to avoid accidental overrides of keys used by
-    # different systems (e.g. RPC) in case the store is multi-tenant.
-    prefix_store = PrefixStore(init_method, store)
-
-    # TODO(Yizhou): The reason we need to set options while vllm does not
-    # seems to be related to the version of PyTorch. In the latest version,
-    # there is no need to set options. While in the older version, 2.5.1
-    # specifically, we need to set options.
-    options = ProcessGroup.Options(backend=backend)
-    pg: ProcessGroup = ProcessGroup(
-        prefix_store,
-        group_rank,
-        group_size,
-        options,
-    )
-    if backend == "gloo":
-        from torch.distributed.distributed_c10d import ProcessGroupGloo
-        backend_class = ProcessGroupGloo(prefix_store,
-                                         group_rank,
-                                         group_size,
-                                         timeout=timeout)
-        backend_type = ProcessGroup.BackendType.GLOO
-        device = torch.device("cpu")
-    elif backend == "nccl":
-        assert is_nccl_available()
-        from torch.distributed.distributed_c10d import ProcessGroupNCCL
-
-        backend_options = ProcessGroupNCCL.Options()
-        backend_options._timeout = timeout
-
-        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
-                                         backend_options)
-        backend_type = ProcessGroup.BackendType.NCCL
-        device = torch.device("cuda")
-    elif backend == "hccl":
-        from torch.distributed import is_hccl_available
-        assert is_hccl_available()
-        from torch_npu._C._distributed_c10d import ProcessGroupHCCL
-        backend_options = ProcessGroupHCCL.Options()
-        backend_options._timeout = timeout
-        backend_class = ProcessGroupHCCL(prefix_store, group_rank, group_size,
-                                         backend_options)
-        device = torch.device("npu")
-        backend_class._set_sequence_number_for_group()
-        backend_type = ProcessGroup.BackendType.CUSTOM
-        pg._register_backend(device, backend_type, backend_class)
-        return pg
-    else:
-        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
-
-    # TODO(Yizhou): Like we mentioned above, _set_default_backend is not
-    # implemented in the 2.5.1 version of PyTorch. But we need to set it
-    # after the latest version is released.
-    # pg._set_default_backend(backend_type)
-    backend_class._set_sequence_number_for_group()
-
-    pg._register_backend(device, backend_type, backend_class)
-
-    return pg
-
-
-utils.stateless_init_torch_distributed_process_group = stateless_init_torch_distributed_process_group
diff --git a/vllm_ascend/patch/worker/patch_0_9_0/__init__.py b/vllm_ascend/patch/platform/patch_0_9_1/__init__.py
similarity index 100%
rename from vllm_ascend/patch/worker/patch_0_9_0/__init__.py
rename to vllm_ascend/patch/platform/patch_0_9_1/__init__.py
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index d1d3d42..3b29856 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.9.0"):
-    from vllm_ascend.patch.worker import patch_0_9_0  # noqa: F401
+if vllm_version_is("0.9.1"):
+    from vllm_ascend.patch.worker import patch_0_9_1  # noqa: F401
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
diff --git a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py
similarity index 90%
rename from vllm_ascend/patch/platform/patch_0_9_0/__init__.py
rename to vllm_ascend/patch/worker/patch_0_9_1/__init__.py
index f0ac162..116c73c 100644
--- a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py
+++ b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py
@@ -14,4 +14,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import vllm_ascend.patch.platform.patch_0_9_0.patch_distributed  # noqa
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 9b0a860..c358793 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -74,7 +74,7 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
-from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is
+from vllm_ascend.utils import ProfileExecuteDuration
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
 
 if TYPE_CHECKING:
@@ -1614,44 +1614,27 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         import torch_npu
 
         kv_caches: Dict[str, torch.Tensor] = {}
-        # Remove this after we drop 0.9.0 support
-        if vllm_version_is("0.9.0"):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-                block_size=self.cache_config.block_size,
-            )
-        else:
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-                block_sizes=[self.cache_config.block_size],
-            )
+        self.input_batch = InputBatch(
+            max_num_reqs=self.max_num_reqs,
+            max_model_len=self.model_config.max_model_len,
+            max_num_batched_tokens=self.max_num_tokens,
+            device=self.device,
+            pin_memory=True,
+            vocab_size=self.model_config.get_vocab_size(),
+            block_sizes=[self.cache_config.block_size],
+        )
 
-        if not vllm_version_is("0.9.0"):
-            kv_cache_sizes = {}
-            for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
-                assert len(kv_cache_tensor.shared_by) == 1, (
-                    "KV cache tensor shared by multiple layers is not supported in "
-                    "NPU.")
-                kv_cache_sizes[
-                    kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
+        kv_cache_sizes = {}
+        for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+            assert len(kv_cache_tensor.shared_by) == 1, (
+                "KV cache tensor shared by multiple layers is not supported in "
+                "NPU.")
+            kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
 
         for kv_cache_group in kv_cache_config.kv_cache_groups:
             kv_cache_spec = kv_cache_group.kv_cache_spec
             for layer_name in kv_cache_group.layer_names:
-                if vllm_version_is("0.9.0"):
-                    tensor_size = kv_cache_config.tensors[layer_name].size
-                else:
-                    tensor_size = kv_cache_sizes[layer_name]
+                tensor_size = kv_cache_sizes[layer_name]
                 assert tensor_size % kv_cache_spec.page_size_bytes == 0
                 num_blocks = tensor_size // kv_cache_spec.page_size_bytes
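
The expected counters in test_simple_piecewise_compile follow directly from the comments kept in the test above. The arithmetic, spelled out as a worked example; num_layers and num_cudagraph_sizes are inferred from the expected values rather than read from the test's compilation config:

# counter_arithmetic.py -- illustrative only, not part of this diff.
num_layers = 2            # inferred: 2 * num_layers + 1 == 5
num_cudagraph_sizes = 2   # inferred: num_cudagraph_sizes * 3 == 6

num_graphs_seen = 1                                                # one graph for the model
num_piecewise_graphs_seen = 2 * num_layers + 1                     # 5
num_piecewise_capturable_graphs_seen = 1 + num_layers              # 3
num_backend_compilations = num_piecewise_capturable_graphs_seen    # 3
num_cudagraph_captured = (num_cudagraph_sizes *
                          num_piecewise_capturable_graphs_seen)    # 6

assert num_piecewise_graphs_seen == 5
assert num_cudagraph_captured == 6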
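
The AscendScheduler hunks leave the prefill block accounting unchanged apart from where req_blocks now comes from. A toy walk-through of that arithmetic, with invented request sizes and a local stand-in for vllm.utils.cdiv:

# block_accounting_sketch.py -- illustrative only, numbers are invented.
def cdiv(a: int, b: int) -> int:
    # ceiling division, same behaviour as vllm.utils.cdiv for positive ints
    return -(a // -b)

block_size = 16
num_new_tokens = 50          # tokens still to be prefilled
num_computed_tokens = 32     # tokens already covered by existing blocks
len_req_blocks = 2           # blocks already held by the request
len_computed_blocks = 2      # prefix-cache blocks that can be reused

num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, block_size)  # 6
num_new_blocks = num_required_blocks - len_req_blocks - len_computed_blocks   # 2
print(num_required_blocks, num_new_blocks)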
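
The patch packages select patch_0_9_1 versus patch_main through vllm_version_is(). A minimal sketch of that gate, assuming the helper is a plain comparison against the installed vLLM version; the real implementation in vllm_ascend/utils.py may differ:

# version_gate_sketch.py -- illustrative only, not part of this diff.
from importlib.metadata import PackageNotFoundError, version


def vllm_version_is(target: str) -> bool:
    # True only when the installed vLLM release matches `target` exactly (assumption).
    try:
        return version("vllm") == target
    except PackageNotFoundError:
        return False


if vllm_version_is("0.9.1"):
    # released-version patches plus the shared ones
    print("import patch_0_9_1 and patch_common")
else:
    # main-branch builds only need the shared patches
    print("import patch_common only")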
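
The model_runner_v1.py hunk switches from kv_cache_config.tensors to the v0.9.1 kv_cache_tensors list, where each tensor declares which layers share it. A toy sketch of the resulting sizing logic, using plain dicts in place of vLLM's KVCacheTensor objects and invented sizes:

# kv_cache_sizing_sketch.py -- illustrative only, all numbers are made up.
page_size_bytes = 2 * 16 * 8 * 128 * 2   # (K+V) * block_size * heads * head_dim * fp16 bytes, hypothetical
kv_cache_tensors = [
    {"shared_by": ["layer_0"], "size": 1024 * page_size_bytes},
    {"shared_by": ["layer_1"], "size": 1024 * page_size_bytes},
]

kv_cache_sizes = {}
for tensor in kv_cache_tensors:
    # v0.9.1 allows one tensor to back several layers; the NPU runner only
    # supports the one-layer case, hence the assert in the diff above.
    assert len(tensor["shared_by"]) == 1
    kv_cache_sizes[tensor["shared_by"][0]] = tensor["size"]

for layer_name, tensor_size in kv_cache_sizes.items():
    assert tensor_size % page_size_bytes == 0
    num_blocks = tensor_size // page_size_bytes
    print(layer_name, num_blocks)   # 1024 blocks per layer in this toy setup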