diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 01ce509f..5404cfb0 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -110,7 +110,7 @@ jobs:
       - name: Upload timing data
         uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
         with:
           name: timing-data-singlecard-light-part${{ matrix.part }}
           path: test_timing_data.json
@@ -200,7 +200,7 @@ jobs:
       - name: Upload timing data
         uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
         with:
           name: timing-data-singlecard-full-part${{ matrix.part }}
           path: test_timing_data.json
@@ -289,7 +289,7 @@ jobs:
       - name: Upload timing data
         uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
         with:
           name: timing-data-2card-light-part${{ matrix.part }}
           path: test_timing_data.json
@@ -378,7 +378,7 @@ jobs:
       - name: Upload timing data
         uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
         with:
           name: timing-data-2card-full-part${{ matrix.part }}
           path: test_timing_data.json
@@ -475,7 +475,7 @@ jobs:
      - name: Upload timing data
        uses: actions/upload-artifact@v4
-        if: ${{ inputs.continue_on_error == true }}
+        if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
        with:
          name: timing-data-4card-full-part${{ matrix.part }}
          path: test_timing_data.json
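The five hunks above add the same guard to every timing-data upload step: the artifact is kept only when `continue_on_error` is set and the run was not triggered by a pull request. Restated as plain Python for readability (the helper name is illustrative, not part of the workflow):

```python
def should_upload_timing_data(continue_on_error: bool, event_name: str) -> bool:
    """Mirror of the workflow expression:
    inputs.continue_on_error == true && github.event_name != 'pull_request'"""
    return continue_on_error and event_name != "pull_request"


assert should_upload_timing_data(True, "schedule")
assert not should_upload_timing_data(True, "pull_request")
assert not should_upload_timing_data(False, "push")
```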
diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index 4bf3fc08..357a1e32 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d
+          VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository
diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 64068c64..9116b5a6 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d
+ARG VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index dfa0b74b..b7f35825 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 76664e0f..0ce86dfa 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 4034c3d32e30d01639459edd3ab486f56993876d
+      vllm: 4497431df654e46fb1fb5e64bf8611e762ae5d87
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
      matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
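The same vLLM commit pin appears in several workflow files here and again below in the codecov workflow and the docs table. A hypothetical one-off check like the following could catch a spot missed during a bump; the script and its location are assumptions, not part of this change:

```python
#!/usr/bin/env python3
"""Hypothetical helper: fail if any tracked file still references the old pin."""
import pathlib
import sys

STALE_PIN = "4034c3d32e30d01639459edd3ab486f56993876d"
SELF = pathlib.Path(__file__).resolve()


def main() -> int:
    offenders = []
    for path in pathlib.Path(".").rglob("*"):
        # Skip git internals, directories, and this script itself.
        if ".git" in path.parts or not path.is_file() or path.resolve() == SELF:
            continue
        try:
            text = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            continue  # binary or unreadable file
        if STALE_PIN in text:
            offenders.append(str(path))
    for name in offenders:
        print(f"stale vLLM pin in {name}")
    return 1 if offenders else 0


if __name__ == "__main__":
    sys.exit(main())
```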
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index dd4f2c84..a50a9c16 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d]
+        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 65bcad44..9bab96b9 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 | vLLM Ascend | vLLM         | Python          | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

 ## Release cadence
diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py
index 1265ddba..f5388680 100644
--- a/tests/ut/eplb/core/test_eplb_utils.py
+++ b/tests/ut/eplb/core/test_eplb_utils.py
@@ -9,7 +9,6 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig, FusedMoE
 from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
-from vllm_ascend.utils import vllm_version_is

 # isort: on

@@ -22,38 +21,22 @@ class TestAscendConfig(unittest.TestCase):
             "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
         }
         from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
-        if vllm_version_is("0.16.0"):
-            moe_parallel_config = FusedMoEParallelConfig(
-                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", is_sequence_parallel=True, enable_eplb=True)
-            moe_config = FusedMoEConfig(
-                num_experts=8,
-                experts_per_token=8,
-                hidden_dim=8192,
-                intermediate_size_per_partition=5,
-                num_local_experts=8,
-                activation="silu",
-                device="npu",
-                routing_method=RoutingMethodType.Simulated,
-                moe_parallel_config=moe_parallel_config,
-                in_dtype=torch.float16,
-            )
-        else:
-            moe_parallel_config = FusedMoEParallelConfig(
-                2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
-                enable_eplb=True)
-            moe_config = FusedMoEConfig(
-                num_experts=8,
-                experts_per_token=8,
-                hidden_dim=8192,
-                intermediate_size_per_partition=5,
-                num_local_experts=8,
-                num_logical_experts=8,
-                activation="silu",
-                device="npu",
-                routing_method=RoutingMethodType.Simulated,
-                moe_parallel_config=moe_parallel_config,
-                in_dtype=torch.float16,
-            )
+        moe_parallel_config = FusedMoEParallelConfig(
+            2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
+            enable_eplb=True)
+        moe_config = FusedMoEConfig(
+            num_experts=8,
+            experts_per_token=8,
+            hidden_dim=8192,
+            intermediate_size_per_partition=5,
+            num_local_experts=8,
+            num_logical_experts=8,
+            activation="silu",
+            device="npu",
+            routing_method=RoutingMethodType.Simulated,
+            moe_parallel_config=moe_parallel_config,
+            in_dtype=torch.float16,
+        )
         moe_config.supports_eplb = True
         self.vllm_config = vllm_config
         self.moe_config = moe_config
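The test cleanup above drops a `vllm_version_is("0.16.0")` branch now that main no longer tracks 0.16.0. For readers unfamiliar with the helper: it gates code on the installed vLLM version. A minimal sketch of such a gate, an assumption about its shape only; the real implementation lives in `vllm_ascend.utils`:

```python
from importlib.metadata import version


def vllm_version_is_sketch(target: str) -> bool:
    """Exact-match gate on the installed vLLM version (sketch only)."""
    return version("vllm") == target
```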
diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py
index 9e1e3985..19e3acde 100644
--- a/vllm_ascend/_310p/model_runner_310p.py
+++ b/vllm_ascend/_310p/model_runner_310p.py
@@ -152,6 +152,7 @@ class NPUModelRunner310(NPUModelRunner):
         remove_lora: bool = True,
         is_graph_capturing: bool = False,
         num_active_loras: int = 0,
+        profile_seq_lens: int | None = None,
     ):
         temporary_context = self.temporary_modify_uniform_decode_query_len() if uniform_decode else nullcontext()
         with temporary_context:
@@ -168,6 +169,7 @@ class NPUModelRunner310(NPUModelRunner):
                 remove_lora=remove_lora,
                 is_graph_capturing=is_graph_capturing,
                 num_active_loras=num_active_loras,
+                profile_seq_lens=profile_seq_lens,
             )

     def _check_and_update_cudagraph_mode(
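The 310P override above only forwards the new `profile_seq_lens` argument; the interesting part of the surrounding code is how it enables a temporary query-len override solely for uniform-decode runs by picking the context manager conditionally, with `contextlib.nullcontext()` as the no-op branch. A standalone sketch of that pattern (all names here are illustrative):

```python
from contextlib import contextmanager, nullcontext


@contextmanager
def temporary_override(obj, attr, value):
    """Set obj.attr to value for the duration of the with-block."""
    old = getattr(obj, attr)
    setattr(obj, attr, value)
    try:
        yield
    finally:
        setattr(obj, attr, old)


class Runner:
    query_len = 128

    def run(self, uniform_decode: bool) -> int:
        # Pick the context manager conditionally, exactly like the hunk above.
        ctx = temporary_override(self, "query_len", 1) if uniform_decode else nullcontext()
        with ctx:
            return self.query_len


assert Runner().run(uniform_decode=True) == 1
assert Runner().run(uniform_decode=False) == 128
```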
diff --git a/vllm_ascend/compilation/compiler_interface.py b/vllm_ascend/compilation/compiler_interface.py
index 22b6f8a1..d82259e5 100644
--- a/vllm_ascend/compilation/compiler_interface.py
+++ b/vllm_ascend/compilation/compiler_interface.py
@@ -17,6 +17,7 @@
 #
 import copy
 import functools
+import logging
 from collections.abc import Callable
 from typing import Any

@@ -31,7 +32,9 @@ from vllm.config import VllmConfig
 from vllm.config.utils import Range

 from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
-from vllm_ascend.utils import COMPILATION_PASS_KEY
+from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
+
+logger = logging.getLogger(__name__)


 def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
@@ -83,6 +86,11 @@ def npugraph_ex_compile(
     config.mode = "reduce-overhead"
     # execute FX graph in eager mode before graph mode to optimize FX graph.
     config.debug.run_eagerly = True
+    if not vllm_version_is("0.17.0"):
+        # Temporary fix for issues with inplace operations in some test cases such as test_whisper.
+        # Avoid rewriting torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to CPU
+        # and cause a copy_between_host_and_device error.
+        config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
     if ascend_compilation_config.enable_static_kernel:
         config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
     # According to the cudagraph_capture_size configuration, set the shapes
@@ -134,6 +142,22 @@ class AscendCompiler(CompilerInterface):
         # see https://github.com/pytorch/pytorch/issues/138980
         graph = copy.deepcopy(graph)

+        if not vllm_version_is("0.17.0"):
+            from torch._guards import detect_fake_mode
+
+            current_fake_mode = detect_fake_mode()
+            if current_fake_mode is not None:
+                example_inputs = [
+                    current_fake_mode.from_tensor(inp)
+                    if (
+                        isinstance(inp, torch.Tensor)
+                        and hasattr(inp, "fake_mode")
+                        and inp.fake_mode is not current_fake_mode
+                    )
+                    else inp
+                    for inp in example_inputs
+                ]
+
         ascend_compilation_config = get_ascend_config().ascend_compilation_config
         if ascend_compilation_config.enable_npugraph_ex:
             assert hasattr(self, "vllm_config")
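The new `AscendCompiler` hunk re-associates `example_inputs` with the currently active fake tensor mode before compilation, since tensors created under a stale `FakeTensorMode` cause mixed-mode failures inside the backend. A runnable sketch of the conversion using the same `from_tensor` API; the production code additionally filters on a tensor's existing `fake_mode` attribute:

```python
import torch
from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode

mode = FakeTensorMode()
example_inputs = [torch.randn(2, 3), 1.0]  # tensors mixed with plain scalars

# Convert tensor inputs under the active mode; pass everything else through.
remapped = [
    mode.from_tensor(x) if isinstance(x, torch.Tensor) else x
    for x in example_inputs
]

assert isinstance(remapped[0], FakeTensor)
assert remapped[0].fake_mode is mode  # tensor now belongs to the active mode
assert remapped[1] == 1.0             # non-tensors are untouched
```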
diff --git a/vllm_ascend/compilation/passes/muls_add_pass.py b/vllm_ascend/compilation/passes/muls_add_pass.py
index 0a379d17..74612a7a 100644
--- a/vllm_ascend/compilation/passes/muls_add_pass.py
+++ b/vllm_ascend/compilation/passes/muls_add_pass.py
@@ -18,17 +18,12 @@ from __future__ import annotations

 import torch
 from torch._inductor.pattern_matcher import PatternMatcherPass
+from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig
 from vllm.config.compilation import Range
 from vllm.logger import logger

 from vllm_ascend.compilation.passes.base_pattern import BasePattern
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
-else:
-    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass


 class MulsAddPattern(BasePattern):
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index ba7a8f3d..6bda63f0 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -22,6 +22,7 @@ import vllm_ascend.patch.platform.patch_kv_cache_interface  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_minimax_m2_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
+import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa

 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py
index 50f74e60..da2c93ac 100644
--- a/vllm_ascend/patch/platform/patch_multiproc_executor.py
+++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import threading
 import weakref
 from collections import deque
@@ -19,6 +21,8 @@ from vllm.v1.executor.multiproc_executor import (
     set_multiprocessing_worker_envs,
 )

+from vllm_ascend.utils import vllm_version_is
+

 class AscendMultiprocExecutor(MultiprocExecutor):
     def _init_executor(self) -> None:
         # Call self.shutdown at exit to clean up
         # and ensure workers will be terminated.
         self._finalizer = weakref.finalize(self, self.shutdown)
         self.is_failed = False
-        self.shutdown_event = threading.Event()
+        if vllm_version_is("0.17.0"):
+            self.shutdown_event = threading.Event()
         self.failure_callback: FailureCallback | None = None

         tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
@@ -66,11 +71,31 @@ class AscendMultiprocExecutor(MultiprocExecutor):
         success = False
         try:
             global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
-            for local_rank in range(self.local_world_size):
-                global_rank = global_start_rank + local_rank
-                is_driver_worker = self._is_driver_worker(global_rank)
-                unready_workers.append(
-                    AscendWorkerProc.make_worker_process(
+            if vllm_version_is("0.17.0"):
+                for local_rank in range(self.local_world_size):
+                    global_rank = global_start_rank + local_rank
+                    is_driver_worker = self._is_driver_worker(global_rank)
+                    unready_workers.append(
+                        AscendWorkerProc.make_worker_process(
+                            vllm_config=self.vllm_config,
+                            local_rank=local_rank,
+                            rank=global_rank,
+                            distributed_init_method=distributed_init_method,
+                            input_shm_handle=scheduler_output_handle,
+                            shared_worker_lock=shared_worker_lock,
+                            is_driver_worker=is_driver_worker,
+                        )
+                    )
+            else:
+                # When using fork, keep track of socket file descriptors that are
+                # inherited by the worker, so that we can close them in subsequent
+                # workers
+                inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
+
+                for local_rank in range(self.local_world_size):
+                    global_rank = global_start_rank + local_rank
+                    is_driver_worker = self._is_driver_worker(global_rank)
+                    unready_worker_handle = AscendWorkerProc.make_worker_process(
                         vllm_config=self.vllm_config,
                         local_rank=local_rank,
                         rank=global_rank,
@@ -78,8 +103,12 @@ class AscendMultiprocExecutor(MultiprocExecutor):
                         input_shm_handle=scheduler_output_handle,
                         shared_worker_lock=shared_worker_lock,
                         is_driver_worker=is_driver_worker,
+                        inherited_fds=inherited_fds,
                     )
-                )
+                    unready_workers.append(unready_worker_handle)
+                    if inherited_fds is not None:
+                        inherited_fds.append(unready_worker_handle.death_writer.fileno())
+                        inherited_fds.append(unready_worker_handle.ready_pipe.fileno())

             # Workers must be created before wait_for_ready to avoid
             # deadlock, since worker.init_device() does a device sync.
@@ -124,6 +153,8 @@ class AscendMultiprocExecutor(MultiprocExecutor):
                 for uw in unready_workers:
                     if uw.death_writer is not None:
                         uw.death_writer.close()
+                        if not vllm_version_is("0.17.0"):
+                            uw.death_writer = None
                 self._ensure_worker_termination([uw.proc for uw in unready_workers])

         self.output_rank = self._get_output_rank()
@@ -158,38 +189,76 @@ class AscendWorkerProc(WorkerProc):
         input_shm_handle,  # Receive SchedulerOutput
         shared_worker_lock: LockType,
         is_driver_worker: bool = False,
+        inherited_fds: list[int] | None = None,
     ) -> UnreadyWorkerProcHandle:
         context = get_mp_context()
-        # (reader, writer)
-        reader, writer = context.Pipe(duplex=False)
+        if vllm_version_is("0.17.0"):
+            # (reader, writer)
+            reader, writer = context.Pipe(duplex=False)
-        # Create death pipe to detect parent process exit
-        death_reader, death_writer = context.Pipe(duplex=False)
+            # Create death pipe to detect parent process exit
+            death_reader, death_writer = context.Pipe(duplex=False)
-        process_kwargs = {
-            "vllm_config": vllm_config,
-            "local_rank": local_rank,
-            "rank": rank,
-            "distributed_init_method": distributed_init_method,
-            "input_shm_handle": input_shm_handle,
-            "ready_pipe": (reader, writer),
-            "death_pipe": death_reader,
-            "shared_worker_lock": shared_worker_lock,
-            "is_driver_worker": is_driver_worker,
-        }
-        # Run EngineCore busy loop in background process.
-        proc = context.Process(
-            target=WorkerProc.worker_main,
-            kwargs=process_kwargs,
-            name=f"VllmWorker-{rank}",
-            daemon=False,
-        )
+            process_kwargs = {
+                "vllm_config": vllm_config,
+                "local_rank": local_rank,
+                "rank": rank,
+                "distributed_init_method": distributed_init_method,
+                "input_shm_handle": input_shm_handle,
+                "ready_pipe": (reader, writer),
+                "death_pipe": death_reader,
+                "shared_worker_lock": shared_worker_lock,
+                "is_driver_worker": is_driver_worker,
+            }
+            # Run EngineCore busy loop in background process.
+            proc = context.Process(
+                target=WorkerProc.worker_main,
+                kwargs=process_kwargs,
+                name=f"VllmWorker-{rank}",
+                daemon=False,
+            )
-        proc.start()
-        writer.close()
-        # Keep death_writer open in parent - when parent exits,
-        # death_reader in child will get EOFError
-        return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
+            proc.start()
+            writer.close()
+            # Keep death_writer open in parent - when parent exits,
+            # death_reader in child will get EOFError
+            return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
+        else:
+            # Ready pipe to communicate readiness from child to parent
+            ready_reader, ready_writer = context.Pipe(duplex=False)
+            # Death pipe to let child detect parent process exit
+            death_reader, death_writer = context.Pipe(duplex=False)
+            if inherited_fds is not None:
+                inherited_fds = inherited_fds.copy()
+                inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
+            process_kwargs = {
+                "vllm_config": vllm_config,
+                "local_rank": local_rank,
+                "rank": rank,
+                "distributed_init_method": distributed_init_method,
+                "input_shm_handle": input_shm_handle,
+                "ready_pipe": ready_writer,
+                "death_pipe": death_reader,
+                "shared_worker_lock": shared_worker_lock,
+                "is_driver_worker": is_driver_worker,
+                # Have the worker close parent end of this worker's pipes too
+                "inherited_fds": inherited_fds if inherited_fds is not None else [],
+            }
+            # Run EngineCore busy loop in background process.
+            proc = context.Process(
+                target=WorkerProc.worker_main,
+                kwargs=process_kwargs,
+                name=f"VllmWorker-{rank}",
+                daemon=False,
+            )
+
+            proc.start()
+            # Close child ends of pipes here in the parent
+            ready_writer.close()
+            death_reader.close()
+            # Keep death_writer open in parent - when parent exits,
+            # death_reader in child will get EOFError
+            return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)


 vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
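The non-0.17.0 path above mirrors upstream vLLM's fork-safety bookkeeping: with the fork start method, each new worker inherits the parent's open pipe ends for all earlier workers, so the parent records those file descriptors and each child closes them at startup. A toy standalone model of that bookkeeping (the worker body and pipe usage are illustrative, not the vLLM `WorkerProc`):

```python
import multiprocessing as mp
import os


def toy_worker(rank: int, inherited_fds: list[int]) -> None:
    # Drop pipe ends that belong to earlier workers; without this, EOF-based
    # parent-death detection on those pipes would never fire.
    for fd in inherited_fds:
        try:
            os.close(fd)
        except OSError:
            pass


if __name__ == "__main__":
    ctx = mp.get_context("fork")
    inherited_fds: list[int] = []
    procs = []
    for rank in range(2):
        reader, writer = ctx.Pipe(duplex=False)
        # Pass a snapshot so this worker closes only *earlier* workers' fds.
        p = ctx.Process(target=toy_worker, args=(rank, inherited_fds.copy()))
        p.start()
        procs.append(p)
        # Record this worker's parent-side fds for the workers forked after it.
        inherited_fds.extend((reader.fileno(), writer.fileno()))
    for p in procs:
        p.join()
```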
diff --git a/vllm_ascend/patch/platform/patch_torch_accelerator.py b/vllm_ascend/patch/platform/patch_torch_accelerator.py
new file mode 100644
index 00000000..431dce4e
--- /dev/null
+++ b/vllm_ascend/patch/platform/patch_torch_accelerator.py
@@ -0,0 +1,8 @@
+import torch
+
+
+def patch_empty_cache() -> None:
+    torch.npu.empty_cache()
+
+
+torch.accelerator.empty_cache = patch_empty_cache
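The new platform patch redirects `torch.accelerator.empty_cache` to the NPU backend unconditionally. A slightly more defensive variant of the same monkey-patching pattern, keeping the original callable around; a sketch assuming a PyTorch build that ships `torch.accelerator.empty_cache`, as the pinned torch 2.9 does:

```python
import torch

# Keep a handle to the stock implementation so the patch stays reversible.
_original_empty_cache = torch.accelerator.empty_cache


def patched_empty_cache() -> None:
    # On Ascend builds torch.npu is present and owns the cache; elsewhere,
    # defer to whatever PyTorch shipped.
    if hasattr(torch, "npu"):
        torch.npu.empty_cache()
    else:
        _original_empty_cache()


torch.accelerator.empty_cache = patched_empty_cache
```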
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 62e7fe80..b9b6c644 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -46,7 +46,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
 from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
-from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
+from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled

 # Currently we will fix block size to a small one since `num_reqs` can't be too large
 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -615,24 +615,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         if not self.parallel_drafting:
             for draft_step in range(1, self.num_speculative_tokens):
                 per_layer_attn_metadata = dict()
-                if vllm_version_is("0.17.0"):
-                    for attn_group in self.draft_attn_groups:
-                        common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
-                            draft_step,
-                            attn_metadata,
-                            common_attn_metadata,
-                            batch_size,
-                            num_input_tokens,
-                            used_update_positions,
-                            aclgraph_runtime_mode,
-                            ori_seq_len,
-                            slot_indices,
-                            mtp_slot_mapping,
-                            attn_group=attn_group,
-                        )
-                    for layer_name in self.attn_layer_names:
-                        per_layer_attn_metadata[layer_name] = attn_metadata
-                else:
+                for attn_group in self.draft_attn_groups:
                     common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
                         draft_step,
                         attn_metadata,
@@ -644,6 +627,7 @@ class SpecDecodeBaseProposer(EagleProposer):
                         num_input_tokens,
                         used_update_positions,
                         aclgraph_runtime_mode,
                         ori_seq_len,
                         slot_indices,
                         mtp_slot_mapping,
+                        attn_group=attn_group,
                     )
                 for layer_name in self.attn_layer_names:
                     per_layer_attn_metadata[layer_name] = attn_metadata
@@ -653,21 +637,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         if not self.parallel_drafting:
             for draft_step in range(1, self.num_speculative_tokens):
                 per_layer_attn_metadata = dict()
-                if vllm_version_is("0.17.0"):
-                    for attn_group in self.draft_attn_groups:
-                        common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
-                            draft_step,
-                            attn_metadata,
-                            common_attn_metadata,
-                            batch_size,
-                            num_input_tokens,
-                            used_update_positions,
-                            aclgraph_runtime_mode,
-                            attn_group=attn_group,
-                        )
-                    for layer_name in self.attn_layer_names:
-                        per_layer_attn_metadata[layer_name] = attn_metadata
-                else:
+                for attn_group in self.draft_attn_groups:
                     common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
                         draft_step,
                         attn_metadata,
@@ -676,6 +646,7 @@ class SpecDecodeBaseProposer(EagleProposer):
                         num_input_tokens,
                         used_update_positions,
                         aclgraph_runtime_mode,
+                        attn_group=attn_group,
                     )
                 for layer_name in self.attn_layer_names:
                     per_layer_attn_metadata[layer_name] = attn_metadata
@@ -1082,16 +1053,11 @@ class SpecDecodeBaseProposer(EagleProposer):
         # 2.
         # Recompute the slot mapping based on the new positions and
         # rejection mask.
-        if vllm_version_is("0.17.0"):
-            # Use the first draft attention group's kv_cache_spec for block_size
-            # (all draft layers share the same kv-cache group)
-            assert len(self.draft_attn_groups) > 0
-            block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
-        else:
-            if self.attn_metadata_builder is None:
-                block_size = self._get_attention_metadata_builder().kv_cache_spec.block_size
-            else:
-                block_size = self.attn_metadata_builder.kv_cache_spec.block_size
+        # Use the first draft attention group's kv_cache_spec for block_size
+        # (all draft layers share the same kv-cache group)
+        assert len(self.draft_attn_groups) > 0
+        block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
+
         new_slot_mapping = compute_new_slot_mapping(
             cad=cad,
             new_positions=self.positions[:total_num_output_tokens],
@@ -1130,8 +1096,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         attn_group=None,
     ):
         assert draft_step > 0
-        if vllm_version_is("0.17.0"):
-            assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
+        assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"

         common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
         if draft_step == 1:
@@ -1243,13 +1208,7 @@ class SpecDecodeBaseProposer(EagleProposer):
         # Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
         common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]

-        if vllm_version_is("0.17.0"):
-            attn_metadata_builder = attn_group.get_metadata_builder()
-        else:
-            if self.attn_metadata_builder is None:
-                attn_metadata_builder = self._get_attention_metadata_builder()
-            else:
-                attn_metadata_builder = self.attn_metadata_builder
+        attn_metadata_builder = attn_group.get_metadata_builder()

         attn_metadata = attn_metadata_builder.build_for_drafting(
             common_attn_metadata=common_attn_metadata,
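With the version gates gone, every draft step follows one shape: ask each draft attention group's builder for fresh metadata, then fan the result out to all attention layer names. A toy model of that control flow (the dataclasses are illustrative stand-ins for the real attention groups and builders):

```python
from dataclasses import dataclass


@dataclass
class ToyBuilder:
    block_size: int

    def build_for_drafting(self, draft_step: int) -> dict:
        return {"step": draft_step, "block_size": self.block_size}


@dataclass
class ToyAttnGroup:
    builder: ToyBuilder

    def get_metadata_builder(self) -> ToyBuilder:
        return self.builder


draft_attn_groups = [ToyAttnGroup(ToyBuilder(block_size=128))]
attn_layer_names = ["model.layers.0.attn", "model.layers.1.attn"]

per_layer_attn_metadata: dict[str, dict] = {}
for draft_step in range(1, 3):
    for attn_group in draft_attn_groups:
        metadata = attn_group.get_metadata_builder().build_for_drafting(draft_step)
    for layer_name in attn_layer_names:
        per_layer_attn_metadata[layer_name] = metadata

assert per_layer_attn_metadata["model.layers.0.attn"]["step"] == 2
```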
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index f2ab3074..96f0f78b 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -412,15 +412,14 @@ class NPUModelRunner(GPUModelRunner):
         self.cpu_slot_mapping = None
         self.sampling_done_event: torch.npu.Event | None = None

-        if vllm_version_is("0.17.0"):
-            # self.cudagraph_batch_sizes sorts in ascending order.
-            if (
-                self.compilation_config.cudagraph_capture_sizes
-                and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            ):
-                self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
-            else:
-                self.cudagraph_batch_sizes = []
+        # self.cudagraph_batch_sizes is sorted in ascending order.
+        if (
+            self.compilation_config.cudagraph_capture_sizes
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
+        else:
+            self.cudagraph_batch_sizes = []

         self.mamba_state_idx: dict[str, int] = {}
         self._mamba_copy_bufs: mamba_utils.MambaCopyBuffers | None = None
@@ -1376,7 +1375,12 @@ class NPUModelRunner(GPUModelRunner):
                 skip_compiled=has_encoder_input,
             ),
             self.maybe_get_kv_connector_output(
-                scheduler_output, clear_metadata=clear_kv_metadata
+                scheduler_output,
+                **(
+                    {"clear_metadata": clear_kv_metadata}
+                    if vllm_version_is("0.17.0")
+                    else {"defer_finalize": not clear_kv_metadata}
+                ),
             ) as kv_connector_output,
         ):
             hidden_states = self._model_forward(
@@ -2253,6 +2257,7 @@ class NPUModelRunner(GPUModelRunner):
         remove_lora: bool = True,
         is_graph_capturing: bool = False,
         num_active_loras: int = 0,
+        profile_seq_lens: int | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # only support eager mode and piecewise graph now
         assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()
@@ -2359,11 +2364,14 @@ class NPUModelRunner(GPUModelRunner):
         # seq_lens. We use this seq_len only when capturing graph, and still use max_query_len
         # in inference. This will be removed once npu_fused_infer_attention_score
         # outperforms _npu_paged_attention on all cases.
-        seq_lens = (
-            SEQ_LEN_WITH_MAX_PA_WORKSPACE
-            if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
-            else max_query_len
-        )  # type: ignore[assignment]
+        if profile_seq_lens is not None:
+            seq_lens = profile_seq_lens
+        else:
+            seq_lens = (
+                SEQ_LEN_WITH_MAX_PA_WORKSPACE
+                if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
+                else max_query_len
+            )  # type: ignore[assignment]
         self.seq_lens.np[:num_reqs_padded] = seq_lens
         self.seq_lens.np[num_reqs_padded:] = 0
         self.seq_lens.copy_to_gpu()
@@ -2579,14 +2587,13 @@ class NPUModelRunner(GPUModelRunner):
         self.may_reinitialize_input_batch(kv_cache_config)
         kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)

-        if vllm_version_is("0.17.0"):
-            # TODO: refactor the logic of attention
-            # Initialize drafter attention group initialization
-            if self.speculative_config and (
-                self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
-            ):
-                assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
-                self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
+        # TODO: refactor the logic of attention
+        # Initialize the drafter's attention groups
+        if self.speculative_config and (
+            self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
+        ):
+            assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
+            self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)

         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
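The `maybe_get_kv_connector_output` call above selects its keyword argument by vLLM version with a dict-unpacking expression, passing `clear_metadata` on 0.17.0 and `defer_finalize` (with inverted meaning) otherwise. The same expression shape, stripped down to a runnable toy with illustrative names:

```python
def connector(output, *, clear_metadata=None, defer_finalize=None):
    return {"clear_metadata": clear_metadata, "defer_finalize": defer_finalize}


def call(output, clear_kv_metadata: bool, is_v017: bool):
    # Exactly one of the two keywords is supplied, chosen at call time.
    return connector(
        output,
        **(
            {"clear_metadata": clear_kv_metadata}
            if is_v017
            else {"defer_finalize": not clear_kv_metadata}
        ),
    )


assert call("out", True, is_v017=True) == {"clear_metadata": True, "defer_finalize": None}
assert call("out", True, is_v017=False) == {"clear_metadata": None, "defer_finalize": False}
```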
@@ -3031,11 +3038,18 @@ class NPUModelRunner(GPUModelRunner):
             max_num_blocks.append(max_num_blocks_per_req)

         if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            assert self.cache_config.cpu_offload_gb == 0, (
-                "Cannot re-initialize the input batch when CPU weight "
-                "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
-                "for more details."
-            )
+            if vllm_version_is("0.17.0"):
+                assert self.cache_config.cpu_offload_gb == 0, (
+                    "Cannot re-initialize the input batch when CPU weight "
+                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                    "for more details."
+                )
+            else:
+                assert self.offload_config.uva.cpu_offload_gb == 0, (
+                    "Cannot re-initialize the input batch when CPU weight "
+                    "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
+                    "for more details."
+                )
             self.input_batch = NPUInputBatch(
                 max_num_reqs=self.max_num_reqs,
                 max_model_len=max_model_len,
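The final hunk handles vLLM relocating the CPU offload setting from `cache_config.cpu_offload_gb` to `offload_config.uva.cpu_offload_gb` by branching on the version. Where a version check is not convenient, the same compatibility can be expressed structurally; a hedged sketch with illustrative names:

```python
def get_cpu_offload_gb(runner) -> float:
    """Read the CPU offload size from whichever config layout is present."""
    offload_config = getattr(runner, "offload_config", None)
    if offload_config is not None and hasattr(offload_config, "uva"):
        return offload_config.uva.cpu_offload_gb  # newer layout
    return runner.cache_config.cpu_offload_gb  # 0.17.0 layout
```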