diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index 6ba035cf..4240549b 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -34,7 +34,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=7157596103666ee7ccb7008acee8bff8a8ff1731
+          VLLM_COMMIT=8be6432bdaf6275664d857b1e5e9bf8ed1ce299e
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

       - name: Checkout repository
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 7a8d6b44..50450b42 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -74,7 +74,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+        vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 3a586e45..816aecb5 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 7157596103666ee7ccb7008acee8bff8a8ff1731
+      vllm: 8be6432bdaf6275664d857b1e5e9bf8ed1ce299e
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
       SOC_VERSION: ascend910b1
     strategy:
       matrix:
-        vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+        vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]

     steps:
       - name: Free up disk space
@@ -163,7 +163,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+        vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 211867da..fdd3249f 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -51,7 +51,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
+| main | 8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

 ## Release cadence
diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index 38a931fb..c06f1a07 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -28,7 +28,8 @@ from vllm.utils.network_utils import get_open_port
 from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

 MODELS = [
-    "Qwen/Qwen3-0.6B",
+    # Offline data parallel mode is not supported/useful for dense models
+    # "Qwen/Qwen3-0.6B",
     "vllm-ascend/DeepSeek-V2-Lite-W8A8",
 ]
diff --git a/tests/e2e/multicard/test_data_parallel.py b/tests/e2e/multicard/test_data_parallel.py
index 6b4df381..c197181b 100644
--- a/tests/e2e/multicard/test_data_parallel.py
+++ b/tests/e2e/multicard/test_data_parallel.py
@@ -27,9 +27,7 @@ from unittest.mock import patch

 import pytest

-MODELS = [
-    "Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"
-]
+MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]


 @pytest.mark.parametrize("model", MODELS)
diff --git a/tests/e2e/multicard/test_data_parallel_tp2.py b/tests/e2e/multicard/test_data_parallel_tp2.py
index 03b2d665..ceee5a64 100644
--- a/tests/e2e/multicard/test_data_parallel_tp2.py
+++ b/tests/e2e/multicard/test_data_parallel_tp2.py
@@ -9,7 +9,7 @@ from unittest.mock import patch

 import pytest

-MODELS = ["Qwen/Qwen3-0.6B"]
+MODELS = ["Qwen/Qwen3-30B-A3B"]


 @pytest.mark.parametrize("model", MODELS)
diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py
index 9f2f61aa..efbc3cdc 100755
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -17,6 +17,7 @@ from vllm_ascend.attention.mla_v1 import (AscendMLABackend,
                                           AscendMLAPrefillMetadata,
                                           ChunkedContextMetadata)
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.utils import vllm_version_is


 class TestAscendMLABackend(TestBase):
@@ -392,7 +393,10 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
         self.mock_vllm_config.model_config = model_config
         self.kv_cache_spec = MagicMock()
         self.kv_cache_spec.num_layers = 32
-        self.kv_cache_spec.head_size = 128
+        if vllm_version_is('0.13.0'):
+            self.kv_cache_spec.head_size = 128
+        else:
+            self.kv_cache_spec.head_size = 64
         self.kv_cache_spec.num_heads = 32

     @patch("vllm_ascend.attention.mla_v1.get_cos_and_sin_mla")
diff --git a/tests/ut/conftest.py b/tests/ut/conftest.py
index bd8bc4df..77f0ec27 100644
--- a/tests/ut/conftest.py
+++ b/tests/ut/conftest.py
@@ -18,13 +18,6 @@ import sys
 from unittest.mock import MagicMock

-from vllm_ascend.utils import adapt_patch  # noqa E402
-from vllm_ascend.utils import register_ascend_customop
-
-# triton and torch_npu is not available in the environment, so we need to mock them
-sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
-sys.modules['torch_npu._inductor'] = MagicMock()
-
 triton_runtime = MagicMock()
 triton_runtime.driver.active.utils.get_device_properties.return_value = {
     'num_aic': 8,
@@ -32,6 +25,13 @@ triton_runtime.driver.active.utils.get_device_properties.return_value = {
 }
 sys.modules['triton.runtime'] = triton_runtime
+from vllm_ascend.utils import adapt_patch  # noqa E402
+from vllm_ascend.utils import register_ascend_customop  # noqa E402
+
+# triton and torch_npu are not available in the environment, so we need to mock them
+sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
+sys.modules['torch_npu._inductor'] = MagicMock()
+
 adapt_patch()
 adapt_patch(True)
diff --git a/tests/ut/spec_decode/test_eagle_proposer.py b/tests/ut/spec_decode/test_eagle_proposer.py
index 2ecd7db1..5c037ed4 100644
--- a/tests/ut/spec_decode/test_eagle_proposer.py
+++ b/tests/ut/spec_decode/test_eagle_proposer.py
@@ -58,7 +58,6 @@ class TestEagleProposerInitialization(TestBase):
                                 device=self.device,
                                 runner=self.runner)

-        self.assertEqual(proposer.block_size, 16)
         self.assertEqual(proposer.hidden_size, 4096)
         self.assertTrue(proposer.use_cuda_graph)
diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py
index c3d62dc5..d6915cfb 100644
--- a/tests/ut/spec_decode/test_mtp_proposer.py
+++ b/tests/ut/spec_decode/test_mtp_proposer.py
@@ -86,7 +86,6 @@ class TestMtpProposer:
         assert proposer.dtype == torch.float16
         assert proposer.num_speculative_tokens == 2
         assert proposer.hidden_size == 4096
-        assert proposer.block_size == 16

         # Test with mrope enabled
         assert hasattr(proposer, "positions")
diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index cbdb7da3..8eb89b97 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -197,6 +197,7 @@ class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
         vllm_config: VllmConfig,
         device: torch.device,
     ):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.compilation_config = vllm_config.compilation_config
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 1e45af53..625908cd 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -136,6 +136,7 @@ class EagleProposer(VllmEagleProposer):
         draft_attn_layer_names = draft_attn_layer_names - draft_indexer_layer_names
         assert len(draft_attn_layer_names) == 1
         self.attn_layer_name = list(draft_attn_layer_names)
+        self.attn_layer_names = self.attn_layer_name

         # share embed_tokens with the target model if needed
         if get_pp_group().world_size == 1:
@@ -442,14 +443,19 @@ class EagleProposer(VllmEagleProposer):

         # For the requests that exceed the max model length, we set the
         # TODO: sequence length to 1 to minimize their overheads in attention.
+        if self.attn_metadata_builder is None:
+            attn_metadata_builder = self._get_attention_metadata_builder()
+        else:
+            attn_metadata_builder = self.attn_metadata_builder
+        block_size = attn_metadata_builder.kv_cache_spec.block_size
+
         # Compute the slot mapping.
-        block_numbers = (clamped_positions // self.block_size)
+        block_numbers = (clamped_positions // block_size)
         block_ids = attn_metadata.block_tables.gather(
             dim=1, index=block_numbers.view(-1, 1))
         block_ids = block_ids.view(-1)
-        slot_mapping_tmp = (
-            block_ids * self.vllm_config.cache_config.block_size +
-            clamped_positions % self.block_size)
+        slot_mapping_tmp = (block_ids * block_size +
+                            clamped_positions % block_size)

         # Mask out the slot mappings that exceed the max model length.
         # Otherwise, the KV cache will be inadvertently updated with the
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 8017d63a..47f66d13 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -107,7 +107,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type,
                                is_moe_model, lmhead_tp_enable, maybe_trans_nz,
-                               set_weight_prefetch_method)
+                               set_weight_prefetch_method, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager

@@ -1097,12 +1097,20 @@ class NPUModelRunner(GPUModelRunner):
                        intermediate_tensors, inputs_embeds):
         assert self.model is not None

-        hidden_states = self.model(
-            input_ids=input_ids,
-            positions=positions,
-            intermediate_tensors=intermediate_tensors,
-            inputs_embeds=inputs_embeds,
-            **self._init_model_kwargs(maybe_padded_num_tokens))
+        if vllm_version_is('0.13.0'):
+            hidden_states = self.model(
+                input_ids=input_ids,
+                positions=positions,
+                intermediate_tensors=intermediate_tensors,
+                inputs_embeds=inputs_embeds,
+                **self._init_model_kwargs(maybe_padded_num_tokens))
+        else:
+            hidden_states = self.model(
+                input_ids=input_ids,
+                positions=positions,
+                intermediate_tensors=intermediate_tensors,
+                inputs_embeds=inputs_embeds,
+                **self._init_model_kwargs())

         forward_context = get_forward_context()
         if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL \
@@ -1548,10 +1556,16 @@ class NPUModelRunner(GPUModelRunner):
             logits = None
         else:
             if self.input_batch.pooling_params:
-                pool_output = self._pool(
-                    hidden_states,
-                    scheduler_output.total_num_scheduled_tokens,
-                    num_scheduled_tokens_np)
+                if vllm_version_is('0.13.0'):
+                    pool_output = self._pool(
+                        hidden_states,
+                        scheduler_output.total_num_scheduled_tokens,
+                        num_scheduled_tokens_np)
+                else:
+                    pool_output = self._pool(
+                        hidden_states,
+                        scheduler_output.total_num_scheduled_tokens,
+                        num_scheduled_tokens_np, kv_connector_output)
             if self.debugger is not None:
                 self.debugger.stop()
                 self.debugger.step()
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 09cd8f7f..0b290a57 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -299,7 +299,7 @@ class NPUWorker(WorkerBase):
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> ModelRunnerOutput | None:
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | None:
         # enable msMonitor to monitor the performance of vllm-ascend
         if envs_ascend.MSMONITOR_USE_DAEMON:
             dp.step()
@@ -318,7 +318,8 @@ class NPUWorker(WorkerBase):

         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
-        if isinstance(output, (ModelRunnerOutput, NoneType)):
+        if isinstance(output,
+                      (ModelRunnerOutput, AsyncModelRunnerOutput, NoneType)):
             return output

         assert isinstance(output, IntermediateTensors)