diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index cb6608fa..04f2aa72 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -32,7 +32,7 @@ on:
         description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
       vllm_version:
         required: false
-        default: "v0.12.0"
+        default: "v0.13.0"
         type: string
         description: vllm version to use
       vllm_ascend_remote_url:
diff --git a/.github/workflows/nightly_test_a2.yaml b/.github/workflows/nightly_test_a2.yaml
index 8fca3bad..57003ef9 100644
--- a/.github/workflows/nightly_test_a2.yaml
+++ b/.github/workflows/nightly_test_a2.yaml
@@ -60,7 +60,7 @@ jobs:
             tests: tests/e2e/nightly/ops
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
-      vllm: v0.12.0
+      vllm: v0.13.0
       runner: ${{ matrix.test_config.os }}
       tests: ${{ matrix.test_config.tests }}
       name: ${{ matrix.test_config.name }}
@@ -128,7 +128,7 @@ jobs:
             - Qwen3-VL-30B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: v0.12.0
+      vllm: v0.13.0
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
diff --git a/.github/workflows/nightly_test_a3.yaml b/.github/workflows/nightly_test_a3.yaml
index 916a8b36..3038b541 100644
--- a/.github/workflows/nightly_test_a3.yaml
+++ b/.github/workflows/nightly_test_a3.yaml
@@ -136,7 +136,7 @@ jobs:
       #       tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
-      vllm: v0.12.0
+      vllm: v0.13.0
       runner: ${{ matrix.test_config.os }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
       tests: ${{ matrix.test_config.tests }}
@@ -156,7 +156,7 @@ jobs:
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
       runner: ${{ matrix.test_config.os }}
-      vllm: v0.12.0
+      vllm: v0.13.0
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
       tests: ${{ matrix.test_config.tests }}
       name: ${{ matrix.test_config.name }}
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 3430699a..e747b5ac 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -74,7 +74,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [releases/v0.13.0, v0.12.0]
+        vllm_version: [v0.13.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 50214570..e1aeed2e 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: releases/v0.13.0
+      vllm: v0.13.0
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
       SOC_VERSION: ascend910b1
     strategy:
       matrix:
-        vllm_version: [releases/v0.13.0, v0.12.0]
+        vllm_version: [v0.13.0]

     steps:
       - name: Free up disk space
@@ -154,7 +154,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [releases/v0.13.0, v0.12.0]
+        vllm_version: [v0.13.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_test_benchmarks.yaml b/.github/workflows/schedule_test_benchmarks.yaml
index 1ba071da..60690ebe 100644
--- a/.github/workflows/schedule_test_benchmarks.yaml
+++ b/.github/workflows/schedule_test_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.12.0
+          - vllm_branch: v0.13.0
             vllm_ascend_branch: main
       max-parallel: 1
     container:
diff --git a/Dockerfile b/Dockerfile
index 11f38018..b2c0db4d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.310p b/Dockerfile.310p
index 9f9072bd..9ca36ad1 100644
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler
index f48a3e87..b7758b8c 100644
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.a3 b/Dockerfile.a3
index 73187eea..68c0c6b4 100644
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -47,7 +47,7 @@ RUN apt-get update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler
index 6ec647cb..4edc89a5 100644
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -50,7 +50,7 @@ RUN yum update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 8a0534dd..f5acbcf4 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -50,7 +50,7 @@ RUN yum update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.12.0
+ARG VLLM_TAG=v0.13.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 554b598f..02eb1d2d 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -50,7 +50,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | releases/v0.13.0, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
+| main | v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

 ## Release cadence

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 63e1986d..0c636fe4 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,7 +77,7 @@ myst_substitutions = {
     # CANN image tag
     'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
     # vllm version in ci
-    'ci_vllm_version': 'v0.12.0',
+    'ci_vllm_version': 'release/v0.13.0',
 }

 # For cross-file header anchors
diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 1ad608b9..d7a5ecba 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -3,6 +3,7 @@ from unittest.mock import MagicMock, patch

 import pytest
 import torch
+from vllm.attention.selector import AttentionSelectorConfig
 from vllm.config.compilation import CompilationMode, CUDAGraphMode
 from vllm.platforms import PlatformEnum

@@ -484,28 +485,30 @@ class TestNPUPlatform(TestBase):
         self.assertEqual(vllm_config.compilation_config.custom_ops, [])

     def test_get_attn_backend_cls_use_v1_and_mla(self):
-        result = self.platform.get_attn_backend_cls(
-            selected_backend="ascend",
-            head_size=64,
-            dtype="float16",
-            kv_cache_dtype="float16",
-            block_size=64,
-            use_sparse=False,
+        attn_selector_config = AttentionSelectorConfig(
+            dtype=torch.float16,
+            head_size=0,
+            kv_cache_dtype=None,
+            block_size=128,
             use_mla=True,
+            use_sparse=False,
         )
+        result = self.platform.get_attn_backend_cls("ascend",
+                                                     attn_selector_config)
         self.assertEqual(result,
                          "vllm_ascend.attention.mla_v1.AscendMLABackend")

     def test_get_attn_backend_cls_use_v1_only(self):
-        result = self.platform.get_attn_backend_cls(
-            selected_backend="ascend",
-            head_size=64,
-            dtype="float16",
-            kv_cache_dtype="float16",
-            block_size=64,
-            use_sparse=False,
+        attn_selector_config = AttentionSelectorConfig(
+            dtype=torch.float16,
+            head_size=0,
+            kv_cache_dtype=None,
+            block_size=128,
             use_mla=False,
+            use_sparse=False,
         )
+        result = self.platform.get_attn_backend_cls("ascend",
+                                                     attn_selector_config)
         self.assertEqual(
             result,
             "vllm_ascend.attention.attention_v1.AscendAttentionBackend")
diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 2a331ed8..9d913f63 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -274,15 +274,6 @@ class AscendFusedMoE(FusedMoE):
     def update_expert_map(self, new_expert_map):
         self._expert_map = new_expert_map

-    @property
-    def expert_map(self) -> torch.Tensor | None:
-        return self._expert_map
-
-    @expert_map.setter
-    def expert_map(self, new_expert_map):
-        # TODO(Potabk): Remove this once we drop vllm v0.12.0(This makes backward compatibility with vllm v0.12.0)
-        self._expert_map = new_expert_map
-
     def get_log2phy_map(self):
         return self.log2phy

diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index 0dff139f..26c4dc86 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -17,15 +17,10 @@
 import os

 import vllm_ascend.patch.platform.patch_distributed  # noqa
+import vllm_ascend.patch.platform.patch_ec_connector  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
-from vllm_ascend.utils import vllm_version_is

 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
         "EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
-
-if vllm_version_is("0.12.0"):
-    import vllm_ascend.patch.platform.patch_ec_connector012  # noqa
-else:
-    import vllm_ascend.patch.platform.patch_ec_connector  # noqa
diff --git a/vllm_ascend/patch/platform/patch_ec_connector012.py b/vllm_ascend/patch/platform/patch_ec_connector012.py
deleted file mode 100644
index f0015738..00000000
--- a/vllm_ascend/patch/platform/patch_ec_connector012.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector  # type: ignore[import-not-found] # noqa
-from safetensors.torch import load_file
-from vllm.distributed.ec_transfer.ec_connector.base import \
-    ECConnectorMetadata  # type: ignore[import-not-found] # noqa
-from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (  # type: ignore[import-not-found] # noqa
-    ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
-from vllm.logger import logger
-
-
-class AscendECSharedStorageConnector(ECSharedStorageConnector):
-
-    def start_load_caches(self, encoder_cache, **kwargs) -> None:
-        metadata: ECConnectorMetadata = self._get_connector_metadata()
-        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
-        assert encoder_cache is not None
-        if metadata is None:
-            logger.warning((
-                "In connector.start_load_caches, ",
-                "but the connector metadata is None",
-            ))
-            return
-        # Load the EC for each mm data
-        for mm_data in metadata.mm_datas:
-            if mm_data.mm_hash in encoder_cache:
-                continue
-            filename = self._generate_filename_debug(mm_data.mm_hash)
-            ec_cache = load_file(filename)["ec_cache"].npu()
-            encoder_cache[mm_data.mm_hash] = ec_cache
-            logger.debug("Success load encoder cache for hash %s",
-                         mm_data.mm_hash)
-
-
-vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 12545460..2a70932d 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -351,22 +351,16 @@ class NPUPlatform(Platform):
             CUSTOM_OP_REGISTERED = True

     @classmethod
-    def get_attn_backend_cls(cls, selected_backend, *args, **kwargs):
-        if "attn_selector_config" in kwargs:
-            use_mla = kwargs["attn_selector_config"].use_mla
-            use_sparse = kwargs["attn_selector_config"].use_sparse
-        else:
-            use_mla = kwargs.get("use_mla",
-                                 args[4] if len(args) >= 5 else None)
-            use_sparse = kwargs.get("use_sparse",
-                                    args[6] if len(args) >= 7 else None)
+    def get_attn_backend_cls(cls, selected_backend, attn_selector_config):
         backend_map = {
             (True, False): "vllm_ascend.attention.mla_v1.AscendMLABackend",
             (False, False):
             "vllm_ascend.attention.attention_v1.AscendAttentionBackend",
             (True, True): "vllm_ascend.attention.sfa_v1.AscendSFABackend",
         }
-        return backend_map[(use_mla, use_sparse)]
+
+        return backend_map[(attn_selector_config.use_mla,
+                            attn_selector_config.use_sparse)]

     @classmethod
     def get_punica_wrapper(cls) -> str:
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 38cfcd0c..19e8a310 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -116,8 +116,7 @@ from vllm_ascend.spec_decode.interface import SpecDcodeType
 from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type, is_moe_model,
-                               lmhead_tp_enable, maybe_trans_nz,
-                               vllm_version_is)
+                               lmhead_tp_enable, maybe_trans_nz)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch

 from vllm_ascend.ascend_forward_context import (  # isort: skip
@@ -243,24 +242,15 @@ class NPUModelRunner(GPUModelRunner):
         # Set up Attention
         self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
                                   "index_topk")
-        if vllm_version_is('0.12.0'):
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse)
-        else:
-            self.attn_backend = get_attn_backend(
-                0,
-                self.dtype,
-                None,
-                self.block_size,
-                use_mla=self.model_config.use_mla,
-                use_sparse=self.use_sparse,
-                use_mm_prefix=self.model_config is not None
-                and self.model_config.is_mm_prefix_lm)
+        self.attn_backend = get_attn_backend(
+            0,
+            self.dtype,
+            None,
+            self.block_size,
+            use_mla=self.model_config.use_mla,
+            use_sparse=self.use_sparse,
+            use_mm_prefix=self.model_config is not None
+            and self.model_config.is_mm_prefix_lm)
         self.attn_mask_builder = AttentionMaskBuilder(self.device)

         self._set_up_drafter()
@@ -1877,36 +1867,19 @@ class NPUModelRunner(GPUModelRunner):
                     self.speculative_config.method == "mtp":
                 attn_state = AscendAttentionState.SpecDecoding

-            if vllm_version_is("0.12.0"):
-                common_metadata = CommonAttentionMetadata(
-                    query_start_loc=self.query_start_loc.gpu[:num_reqs +
+            common_metadata = CommonAttentionMetadata(
+                query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
+                query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
                                                              1],
-                    query_start_loc_cpu=self.query_start_loc.
-                    cpu[:num_reqs + 1],
-                    seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                    seq_lens=self.seq_lens.cpu[:num_reqs],
-                    num_reqs=num_reqs,
-                    num_actual_tokens=num_tokens,
-                    block_table_tensor=block_table_tensor[:num_reqs],
-                    slot_mapping=slot_mapping.gpu,
-                    num_computed_tokens_cpu=num_computed_tokens_cpu,
-                    max_query_len=max_query_len,
-                    max_seq_len=seq_lens)
-            else:
-                common_metadata = CommonAttentionMetadata(
-                    query_start_loc=self.query_start_loc.gpu[:num_reqs +
-                                                             1],
-                    query_start_loc_cpu=self.query_start_loc.
-                    cpu[:num_reqs + 1],
-                    _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                    seq_lens=self.seq_lens.cpu[:num_reqs],
-                    num_reqs=num_reqs,
-                    num_actual_tokens=num_tokens,
-                    block_table_tensor=block_table_tensor[:num_reqs],
-                    slot_mapping=slot_mapping.gpu,
-                    _num_computed_tokens_cpu=num_computed_tokens_cpu,
-                    max_query_len=max_query_len,
-                    max_seq_len=seq_lens)
+                _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
+                seq_lens=self.seq_lens.cpu[:num_reqs],
+                num_reqs=num_reqs,
+                num_actual_tokens=num_tokens,
+                block_table_tensor=block_table_tensor[:num_reqs],
+                slot_mapping=slot_mapping.gpu,
+                _num_computed_tokens_cpu=num_computed_tokens_cpu,
+                max_query_len=max_query_len,
+                max_seq_len=seq_lens)

             for attn_group in self.attn_groups[kv_cache_group_id]:
                 builder = attn_group.get_metadata_builder()
diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py
index 846c0d83..2777ea9f 100644
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -22,6 +22,7 @@ import torch
 from vllm.lora.request import LoRARequest
 from vllm.pooling_params import PoolingParams
 from vllm.v1.outputs import LogprobsTensors
+from vllm.v1.pool.metadata import PoolingStates
 from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
                                              LogitsProcessors)
 from vllm.v1.worker.gpu_input_batch import InputBatch
@@ -29,16 +30,6 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm_ascend.worker.block_table import MultiGroupBlockTable


-class PoolingStates:
-    # NOTE: This should be removed after we drop support of vLLM v0.12.0
-    def __init__(self):
-        # for chunked prefill with ALL pooling
-        self.hidden_states_cache: list[torch.Tensor] = []
-
-    def clean(self):
-        self.hidden_states_cache.clear()
-
-
 class NPUInputBatch(InputBatch):

     def __init__(
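
For reviewers, a minimal usage sketch (not part of the patch) of the attention-backend selection path this diff migrates to, mirroring the rewritten tests in tests/ut/test_platform.py. It assumes vLLM v0.13.0 exposes AttentionSelectorConfig from vllm.attention.selector with the keyword fields used in the diff (dtype, head_size, kv_cache_dtype, block_size, use_mla, use_sparse); treat that constructor signature as an assumption taken from the test code rather than a documented API.

# Hypothetical sketch: resolve the Ascend backend via the new
# AttentionSelectorConfig-based signature introduced by this change.
import torch
from vllm.attention.selector import AttentionSelectorConfig

from vllm_ascend.platform import NPUPlatform

selector_config = AttentionSelectorConfig(
    dtype=torch.float16,
    head_size=0,
    kv_cache_dtype=None,
    block_size=128,
    use_mla=True,      # (use_mla, use_sparse) indexes backend_map in platform.py:
    use_sparse=False,  # (True, False) -> AscendMLABackend
)

# New two-argument call: (selected_backend, attn_selector_config).
backend_cls = NPUPlatform.get_attn_backend_cls("ascend", selector_config)
print(backend_cls)  # "vllm_ascend.attention.mla_v1.AscendMLABackend"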