adapt to main2main for model runner v2 (#7578)
### What this PR does / why we need it?
This PR adapts model runner v2 to the newest commit of the vLLM main branch. Please refer to
https://github.com/vllm-project/vllm-ascend/issues/5208
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.18.0
- vLLM main: ed359c497a
---------
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
@@ -312,7 +312,7 @@
 # Future Plan:
 # Remove this patch when vLLM aligns with the latest processor implementation.
 #
-# ** 10. File: worker/patch_v2_eagle.py**
+# ** 10. File: worker/patch_v2/patch_eagle.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose`
 # Why:
@@ -348,7 +348,7 @@
 # Future Plan:
 # Remove this patch when the PTA version used by vllm-ascend has been upgraded.
 #
-# ** 13. File: worker/patch_v2_uva.py**
+# ** 13. File: worker/patch_v2/patch_uva.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.gpu.states.UvaBuffer`
 # Why:
@@ -553,3 +553,48 @@
 # Future Plan:
 # The maybe_remap_kv_scale_name function of the community is reconstructed to support
 # multiple backends.
+# ** 24. File: worker/patch_v2/patch_input_batch.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.v1.worker.gpu.input_batch.InputBatch`
+# Why:
+# vLLM uses InputBatch to make dummy tensors in `model_runner.py` and `cudagraph_utils.py`,
+# which makes it difficult to inherit from vLLM methods.
+# How:
+# Replace InputBatch with AscendInputBatch.
+# Future Plan:
+# Remove this patch when vllm-ascend's make_dummy behavior aligns with vLLM.
+# ** 25. File: worker/patch_v2/patch_block_table.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.v1.worker.gpu.block_table.BlockTables`
+# Why:
+# vllm-ascend needs to initialize the slot mapping as torch.int32 dtype,
+# but the vLLM default is torch.int64 dtype.
+# How:
+# Replace BlockTables with AscendBlockTables, which initializes the slot mapping
+# as torch.int32 dtype.
+# Future Plan:
+# Remove this patch when vllm-ascend's BlockTables can initialize the
+# slot mapping as torch.int64 dtype.
+# ** 26. File: worker/patch_v2/patch_model_state.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.v1.worker.gpu.model_states.default.init_model_state`
+# Why:
+# vllm-ascend's prepare_attn in ModelState is different from vLLM's,
+# so we need to override init_model_state.
+# How:
+# Define AscendModelState and initialize it in init_model_state.
+# Future Plan:
+# Remove this patch when vllm-ascend's attention metadata is aligned with vLLM.
+# ** 27. File: worker/patch_v2/patch_triton.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.v1.worker.gpu.sample.logprob`, `vllm.v1.worker.gpu.sample.penalties.apply_penalties`,
+# `vllm.v1.worker.gpu.sample.gumbel.gumbel_sample`
+# Why:
+# Triton ops in vLLM do not perform well on NPU, and there is no dispatch mechanism for triton ops.
+# How:
+# Override triton ops in vLLM with the Ascend implementation.
+# Related PR (if no, explain why):
+# Let vLLM support triton ops dispatch.
+# Future Plan:
+# Remove this patch when vLLM supports the dispatch function.
+#
@@ -19,6 +19,7 @@ from vllm.triton_utils import HAS_TRITON
 
 if HAS_TRITON:
     import vllm_ascend.patch.worker.patch_triton
+    import vllm_ascend.patch.worker.patch_v2.patch_triton  # noqa
 
 
 # isort: off
@@ -36,8 +37,8 @@ import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_5  # noqa
 import vllm_ascend.patch.worker.patch_rejection_sampler  # noqa
-import vllm_ascend.patch.worker.patch_v2_eagle  # noqa
-import vllm_ascend.patch.worker.patch_v2_uva  # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_eagle  # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_uva  # noqa
 import vllm_ascend.patch.worker.patch_huanyuan_vl  # noqa
 import vllm_ascend.patch.worker.patch_routed_experts_capturer  # noqa
 import vllm_ascend.patch.worker.patch_npugraph_ex_triton  # noqa
@@ -45,3 +46,6 @@ import vllm_ascend.patch.worker.patch_kimi_k25  # noqa
 import vllm_ascend.patch.worker.patch_draft_quarot  # noqa
 import vllm_ascend.patch.worker.patch_cudagraph  # noqa
 import vllm_ascend.patch.worker.patch_deepseek_mtp  # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_input_batch  # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_model_state  # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_block_table  # noqa
@@ -5,7 +5,6 @@ from vllm_ascend.ops.triton.fla.chunk import chunk_gated_delta_rule
 from vllm_ascend.ops.triton.fla.layernorm_guard import LayerNormFn
 from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_recurrent_gated_delta_rule_fwd_kernel
 from vllm_ascend.ops.triton.mamba.causal_conv1d import causal_conv1d_fn, causal_conv1d_update_npu
-from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample as ascend_gumbel_sample
 
 vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu
 vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
@@ -14,4 +13,3 @@ vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_r
 )
 vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
 vllm.model_executor.layers.fla.ops.chunk_gated_delta_rule = chunk_gated_delta_rule
-vllm.v1.worker.gpu.sample.gumbel.gumbel_sample = ascend_gumbel_sample
vllm_ascend/patch/worker/patch_v2/__init__.py (new file, 0 lines)

vllm_ascend/patch/worker/patch_v2/patch_block_table.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/block_table.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+from vllm.v1.worker.gpu import model_runner
+
+from vllm_ascend.worker.v2.block_table import AscendBlockTables
+
+# vllm-ascend need to initialize slot mapping as torch.int32 dtype,
+# but vllm default is torch.int64 dtype.
+model_runner.BlockTables = AscendBlockTables
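For context, the class this patch installs lives in `vllm_ascend/worker/v2/block_table.py`, which is not part of this diff. A minimal sketch of the idea, under the assumption that the base class exposes a `slot_mappings` tensor (the constructor pass-through here is hypothetical, not the actual implementation):

```python
import torch
from vllm.v1.worker.gpu.block_table import BlockTables


class AscendBlockTables(BlockTables):
    """Hypothetical sketch: BlockTables whose slot mappings are int32 instead of int64."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NPU attention kernels expect int32 slot indices, so re-allocate the
        # base class's default int64 buffer with the narrower dtype.
        self.slot_mappings = torch.zeros_like(self.slot_mappings, dtype=torch.int32)
```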
vllm_ascend/patch/worker/patch_v2/patch_input_batch.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/input_batch.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+
+# Explicitly import the modules so they are fully loaded before patching.
+from vllm.v1.worker.gpu import cudagraph_utils, model_runner
+
+from vllm_ascend.worker.v2.input_batch import AscendInputBatch
+
+cudagraph_utils.InputBatch = AscendInputBatch
+model_runner.InputBatch = AscendInputBatch
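The translated comment above ("explicitly import the modules so they are fully loaded before patching") is the crux of all of these patch_v2 modules. A short illustration of the pattern, illustrative only and not part of the commit:

```python
# Right: import the module object and rebind its attribute, so every later lookup
# of `model_runner.InputBatch` resolves to the Ascend class.
from vllm.v1.worker.gpu import model_runner
from vllm_ascend.worker.v2.input_batch import AscendInputBatch

model_runner.InputBatch = AscendInputBatch

# Not sufficient: a name imported elsewhere *before* this patch runs, e.g.
# `from vllm.v1.worker.gpu.input_batch import InputBatch`, still refers to the
# original class. That is why the patch targets the consuming modules
# (`model_runner` and `cudagraph_utils`) rather than the defining module alone.
```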
vllm_ascend/patch/worker/patch_v2/patch_model_state.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/model_states/default.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+from vllm.v1.worker.gpu import model_runner
+
+from vllm_ascend.worker.v2.model_states import init_asecnd_model_state
+
+# prepare_attn in AscendModelState is different from vllm,
+# we need to override init_model_state.
+model_runner.init_model_state = init_asecnd_model_state
vllm_ascend/patch/worker/patch_v2/patch_triton.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+from vllm.v1.worker.gpu import input_batch
+from vllm.v1.worker.gpu.sample import gumbel, logprob, penalties
+
+from vllm_ascend.worker.v2.input_batch import post_update
+from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample
+from vllm_ascend.worker.v2.sample.logprob import compute_token_logprobs
+from vllm_ascend.worker.v2.sample.penalties import apply_penalties
+
+logprob.compute_token_logprobs = compute_token_logprobs
+penalties.apply_penalties = apply_penalties
+gumbel.gumbel_sample = gumbel_sample
+input_batch.post_update = post_update
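Since these patch modules work entirely through import-time side effects, a quick identity check after importing the patch package can confirm they took effect. A hypothetical snippet, not part of the commit (module paths taken from the diff above):

```python
import vllm_ascend.patch.worker  # importing the package applies the patches listed above

from vllm.v1.worker.gpu.sample import gumbel
from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample

# After the patch, vLLM's module-level name points at the Ascend implementation.
assert gumbel.gumbel_sample is gumbel_sample
```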
@@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development.
 please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208)
 to get specific plans.
 
-supported vllm version: main@4034c3d32e30d01639459edd3ab486f56993876d
-related PR: <https://github.com/vllm-project/vllm-ascend/pull/7110>
+supported vllm version: main@ed359c497a728f08b5b41456c07a688ccd510fbc
+related PR: <https://github.com/vllm-project/vllm-ascend/pull/7598>
@@ -16,128 +16,68 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
-from contextlib import contextmanager
 from typing import Any
 
-import numpy as np
 import torch
 import torch.nn as nn
-import vllm
 from vllm.config import VllmConfig
 from vllm.config.compilation import CUDAGraphMode
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.logger import logger
 from vllm.v1.kv_cache_interface import KVCacheConfig
-from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
 from vllm.v1.worker.gpu.block_table import BlockTables
-from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
+from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor, ModelCudaGraphManager
 from vllm.v1.worker.gpu.input_batch import InputBuffers
 from vllm.v1.worker.gpu.model_states.interface import ModelState
 from vllm.v1.worker.utils import AttentionGroup
 
 from vllm_ascend.ascend_forward_context import _EXTRA_CTX
 from vllm_ascend.compilation.acl_graph import set_graph_params, update_full_graph_params
-from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
-from vllm_ascend.worker.v2.utils import torch_cuda_wrapper
 
 
-class AclGraphManager(CudaGraphManager):
-    """ACL Graph Manager for Ascend NPUs."""
+class ModelAclGraphManager(ModelCudaGraphManager):
+    """ACL Model Cuda Graph Manager for Ascend NPUs."""
 
     def __init__(
         self,
         vllm_config: VllmConfig,
-        use_aux_hidden_state_outputs: bool,
         device: torch.device,
-        model_runner: Any,  # NPUModelRunner type, in case circular import, so we pass it as Any
+        cudagraph_mode: CUDAGraphMode,
+        decode_query_len: int,
+        model_runner: Any,
     ):
+        super().__init__(
+            vllm_config,
+            device,
+            cudagraph_mode,
+            decode_query_len,
+        )
         # set model runner attribute, so we can access attributes model runner
         # when call `run_fullgraph` method in CudaGraphManager,
         # then we don't need to # copy `execute_model` method in `NPUModelRunner` class.
         self.model_runner = model_runner
-        super().__init__(
-            vllm_config,
-            use_aux_hidden_state_outputs,
-            device,
-        )
+        # capture_sizes sorts in ascending order.
+        self.capture_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
        # vllm-ascend need to update graph params of attention backend.
        # so we need to set graph params before capture full graph.
         if super().needs_capture():
-            set_graph_params(self.cudagraph_sizes)
+            set_graph_params(self.capture_sizes)
 
-    def _capture_full_graph(
-        self,
-        num_tokens: int,
-        num_reqs: int,
-        model: nn.Module,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        inputs_embeds: torch.Tensor | None,
-        num_tokens_across_dp: torch.Tensor,
-        attn_metadata: dict[str, Any] | None,
-        slot_mappings: dict[str, torch.Tensor] | None,
-        has_lora: bool = False,
-    ) -> None:
-        """Override _capture_full_graph because we need to set capturing=True in forward context."""
-        # set capturing=True in before model forward.
-        model = ModelWithContext(model)
-        return super()._capture_full_graph(
-            num_tokens,
-            num_reqs,
-            model,
-            input_ids,
-            positions,
-            inputs_embeds,
-            num_tokens_across_dp,
-            attn_metadata,
-            slot_mappings,
-            has_lora,
-        )
-
-    def capture_graph(
-        self,
-        num_tokens: int,
-        capture_cg_mode: CUDAGraphMode,
-        model: nn.Module,
-        model_state: ModelState,
-        input_buffers: InputBuffers,
-        block_tables: BlockTables,
-        attn_groups: list[list[AttentionGroup]],
-        kv_cache_config: KVCacheConfig,
-        has_lora: bool = False,
-        uniform_decode: bool = False,
-    ) -> None:
-        with torch_cuda_wrapper(), prepare_capture_inputs_wrapper():
-            super().capture_graph(
-                num_tokens,
-                capture_cg_mode,
-                model,
-                model_state,
-                input_buffers,
-                block_tables,
-                attn_groups,
-                kv_cache_config,
-                has_lora,
-                uniform_decode,
-            )
-
-    def run_fullgraph(self, num_tokens: int) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:
+    def run_fullgraph(self, desc: BatchExecutionDescriptor) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:
         """Override run_fullgraph to update full graph params in run_fullgraph."""
+        num_tokens = desc.num_tokens
         logger.info_once(f"run_fullgraph with num_tokens={num_tokens}")
-        ret = super().run_fullgraph(num_tokens)
-        assert self.model_runner.cudagraph_and_dp_padding is not None
+        ret = super().run_fullgraph(desc)
 
         positions = self.model_runner.input_buffers.positions[:num_tokens]
-        _num_tokens_after_padding, num_tokens_across_dp, synced_cudagraph_mode = (
-            self.model_runner.cudagraph_and_dp_padding
-        )
-        cudagraph_runtime_mode = CUDAGraphMode(synced_cudagraph_mode)
+        # refer to vllm.v1.worker.gpu.dp_utils.sync_cudagraph_and_dp_padding to
+        # calculate num_tokens_across_dp.
+        num_tokens_across_dp = torch.full([self.model_runner.dp_size], num_tokens, device=self.device)
 
         with set_forward_context(
             self.model_runner.input_batch.attn_metadata,
             self.vllm_config,
             num_tokens=num_tokens,
-            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            cudagraph_runtime_mode=desc.cg_mode,
             num_tokens_across_dp=num_tokens_across_dp,
             batch_descriptor=None,  # Full graph model don't need batch_descriptor
             slot_mapping=self.model_runner.input_batch.slot_mappings,
@@ -155,79 +95,31 @@ class AclGraphManager(CudaGraphManager):
         )
         return ret
 
-    def is_uniform_decode(
-        self,
-        num_reqs: int,
-        num_tokens: int,
-        max_query_len: int,
-    ):
-        return (max_query_len == self.uniform_decode_query_len) and (num_tokens == max_query_len * num_reqs)
-
-
-@contextmanager
-def prepare_capture_inputs_wrapper():
-    """Context manager to override input preparation for NPU graph capture."""
-    # TODO(Ronald1995): make prepare_inputs_to_capture as static method
-    # in CudaGraphManager.
-    ori = vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture
-    try:
-        vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture = prepare_inputs_to_capture
-        yield
-    finally:
-        vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture = ori
-
-
-def prepare_inputs_to_capture(
-    num_reqs: int,
-    num_tokens: int,
-    input_buffers: InputBuffers,
-    block_tables: BlockTables,
-    attn_groups: list[list[AttentionGroup]],
-    max_model_len: int,
-    kv_cache_config: KVCacheConfig,
-    uniform_decode_query_len: int = 0,
-) -> tuple[dict[str, Any], dict[str, torch.Tensor]]:
-    if uniform_decode_query_len > 0:
-        num_tokens_per_req = uniform_decode_query_len
-    else:
-        num_tokens_per_req = num_tokens // num_reqs
-
-    query_start_loc_np = np.arange(num_reqs + 1, dtype=np.int32) * num_tokens_per_req
-    query_start_loc_np[-1] = num_tokens
-    query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
-    input_buffers.query_start_loc[: num_reqs + 1] = query_start_loc_cpu
-    input_buffers.query_start_loc[num_reqs + 1 :] = num_tokens
-    query_start_loc = input_buffers.query_start_loc[: num_reqs + 1]
-
-    # HACK(woosuk): For faster warmup, we set seq_lens (GPU) to num_tokens
-    # rather than max_model_len.
-    input_buffers.seq_lens[:num_reqs] = num_tokens
-    input_buffers.seq_lens[num_reqs:] = 0
-    input_buffers.seq_lens_cpu[:num_reqs] = num_tokens
-    input_buffers.seq_lens_cpu[num_reqs:] = 0
-
-    input_buffers.dcp_local_seq_lens[:num_reqs] = num_tokens
-    input_buffers.dcp_local_seq_lens[num_reqs:] = 0
-
-    input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables]
-    slot_mappings = block_tables.slot_mappings[:, :num_tokens]
-    slot_mappings_by_layer = build_slot_mappings_by_layer(slot_mappings, kv_cache_config)
-
-    attn_metadata = build_attn_metadata(
-        attn_groups=attn_groups,
-        num_reqs=num_reqs,
-        num_tokens=num_tokens,
-        query_start_loc_gpu=query_start_loc,
-        query_start_loc_cpu=query_start_loc_cpu,
-        max_query_len=num_tokens_per_req,
-        seq_lens=input_buffers.seq_lens,
-        max_seq_len=max_model_len,
-        block_tables=input_block_tables,
-        slot_mappings=slot_mappings,
-        kv_cache_config=kv_cache_config,
-        seq_lens_np=input_buffers.seq_lens_np,
-    )
-    return attn_metadata, slot_mappings_by_layer
+    def capture(
+        self,
+        model: nn.Module,
+        model_state: ModelState,
+        input_buffers: InputBuffers,
+        block_tables: BlockTables,
+        attn_groups: list[list[AttentionGroup]],
+        kv_cache_config: KVCacheConfig,
+        has_lora: bool = False,
+        use_aux_hidden_state_outputs: bool = False,
+        progress_bar_desc: str = "Capturing CUDA graphs",
+    ) -> None:
+        """Capture CUDA graphs for model forward pass."""
+        model = ModelWithContext(model)
+        return super().capture(
+            model,
+            model_state,
+            input_buffers,
+            block_tables,
+            attn_groups,
+            kv_cache_config,
+            has_lora,
+            use_aux_hidden_state_outputs,
+            progress_bar_desc,
+        )
 
 
 class ModelWithContext(nn.Module):
@@ -242,6 +134,7 @@ class ModelWithContext(nn.Module):
     def forward(self, *args, **kwargs):
         # In warmup phase, capturing=False by default.
         # when capturing, we need to set capturing=True in forward context.
-        _EXTRA_CTX.capturing = True
+        if torch.npu.is_current_stream_capturing():
+            _EXTRA_CTX.capturing = True
 
         return self.original_model(*args, **kwargs)
@@ -79,14 +79,12 @@ class AscendInputBatch(InputBatch):
         num_reqs: int,
         num_tokens: int,
         input_buffers: AscendInputBuffers,
-        device: torch.device,
     ) -> "AscendInputBatch":
         """Override the make_dummy method to calculate seq_lens_np."""
         input_batch = InputBatch.make_dummy(
             num_reqs,
             num_tokens,
             input_buffers,
-            device,
         )
         # seq_len equals to query_len
         input_buffers.seq_lens_np[:num_reqs] = num_tokens // num_reqs
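The override above only adjusts `seq_lens_np` for dummy batches so that each dummy request's sequence length equals its query length. A small worked example of the arithmetic, with illustrative values:

```python
import numpy as np

num_tokens, num_reqs = 64, 4
seq_lens_np = np.zeros(8, dtype=np.int32)       # stand-in for input_buffers.seq_lens_np
seq_lens_np[:num_reqs] = num_tokens // num_reqs
print(seq_lens_np)                               # [16 16 16 16  0  0  0  0] -> seq_len == query_len
```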
@@ -17,17 +17,13 @@
 # This file is a part of the vllm-ascend project.
 #
 
-import functools
-
 import numpy as np
 import torch
-import vllm
 from vllm.config import VllmConfig
 from vllm.config.compilation import CUDAGraphMode
-from vllm.sequence import IntermediateTensors
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu.buffer_utils import async_copy_to_gpu
+from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor
 from vllm.v1.worker.gpu.input_batch import (
     combine_sampled_and_draft_tokens,
     expand_idx_mapping,
@@ -38,21 +34,21 @@ from vllm.v1.worker.gpu.model_runner import GPUModelRunner
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import set_weight_prefetch_method
-from vllm_ascend.worker.v2.aclgraph_utils import AclGraphManager
+from vllm_ascend.worker.v2.aclgraph_utils import ModelAclGraphManager
 from vllm_ascend.worker.v2.attn_utils import build_attn_state
 from vllm_ascend.worker.v2.input_batch import AscendInputBatch, AscendInputBuffers
 from vllm_ascend.worker.v2.sample.sampler import AscendSampler
 from vllm_ascend.worker.v2.spec_decode import init_speculator
 from vllm_ascend.worker.v2.spec_decode.eagle import AscendEagleSpeculator
 from vllm_ascend.worker.v2.states import AscendRequestState
-from vllm_ascend.worker.v2.utils import block_table_wrapper, model_states_wrapper, torch_cuda_wrapper
+from vllm_ascend.worker.v2.utils import torch_cuda_wrapper
 
 
 class NPUModelRunner(GPUModelRunner):
     """Model runner for Ascend NPUs."""
 
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
-        with torch_cuda_wrapper(), block_table_wrapper(), model_states_wrapper():
+        with torch_cuda_wrapper():
             super().__init__(vllm_config, device)
 
         # because we will override these attribute, delete these attribute to
@@ -64,11 +60,12 @@ class NPUModelRunner(GPUModelRunner):
         del self.speculator
 
         # NPU specific initializations can be added below.
-        self.cudagraph_manager: AclGraphManager = AclGraphManager(
+        self.cudagraph_manager: ModelAclGraphManager = ModelAclGraphManager(
             self.vllm_config,
-            self.use_aux_hidden_state_outputs,
             self.device,
-            self,
+            self.compilation_config.cudagraph_mode,
+            decode_query_len=self.decode_query_len,
+            model_runner=self,
         )
 
         # we define AscendEagleSpeculator in vllm_ascend.worker.v2.spec_decode.eagle
@@ -138,50 +135,17 @@ class NPUModelRunner(GPUModelRunner):
         # so we can inherit `execute_model` method.
         self.input_batch: AscendInputBatch | None = None
 
-    @torch.inference_mode()
-    def execute_model(
-        self,
-        scheduler_output: SchedulerOutput,
-        intermediate_tensors: IntermediateTensors | None = None,
-        dummy_run: bool = False,
-        skip_attn_for_dummy_run: bool = False,
-    ) -> ModelRunnerOutput | IntermediateTensors | None:
-        """Override GPUModelRunner.execute_model for Ascend NPUs by there reasons:
-        1. when run fullgraph, we need to use ret value of `get_cudagraph_and_dp_padding`
-           to set forward_context in `run_fullgraph`.
-        """
-
-        # use closure to store return value of get_cudagraph_and_dp_padding in model runner.
-        def wrapper(func):
-            @functools.wraps(func)
-            def inner(*args, **kwargs):
-                self.cudagraph_and_dp_padding = func(*args, **kwargs)
-                return self.cudagraph_and_dp_padding
-
-            return inner
-
-        if self.cudagraph_and_dp_padding is None:
-            vllm.v1.worker.gpu.model_runner.get_cudagraph_and_dp_padding = wrapper(
-                vllm.v1.worker.gpu.model_runner.get_cudagraph_and_dp_padding
-            )
-
-        return super().execute_model(
-            scheduler_output,
-            intermediate_tensors,
-            dummy_run,
-            skip_attn_for_dummy_run,
-        )
-
     def prepare_inputs(
         self,
         scheduler_output: SchedulerOutput,
-        num_tokens_after_padding: int,
+        batch_desc: BatchExecutionDescriptor,
     ) -> AscendInputBatch:
         """Override GPUModelRunner.prepare_inputs for Ascend NPUs.
         npu attention backends need seq_lens_cpu to work.
         so we need to prepare seq_lens_cpu here.
         """
         num_tokens = scheduler_output.total_num_scheduled_tokens
+        num_tokens_after_padding = batch_desc.num_tokens
         assert num_tokens > 0
         num_tokens_per_req = scheduler_output.num_scheduled_tokens
         num_reqs = len(num_tokens_per_req)
@@ -247,6 +211,7 @@ class NPUModelRunner(GPUModelRunner):
         # Get query_start_loc.
         # NOTE: For FULL mode we change +1 to +2 to reserve extra space for padding.
         # See _pad_query_start_loc_for_fia.
+        num_reqs_padded = batch_desc.num_reqs or num_reqs
         query_start_loc_np = np.empty(self.max_num_reqs + 2, dtype=np.int32)
         query_start_loc_np[0] = 0
         np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1 : num_reqs + 1])
@@ -256,11 +221,12 @@ class NPUModelRunner(GPUModelRunner):
 
         # This is only required for vllm-ascend.
         query_start_loc_np, num_reqs_padded = self._pad_query_start_loc_for_fia(
-            num_tokens_padded=num_tokens_after_padding,
-            num_tokens=num_tokens,
-            num_reqs=num_reqs,
-            query_start_loc_np=query_start_loc_np,
-            max_query_len=max(scheduler_output.num_scheduled_tokens.values()),
+            num_tokens_after_padding,
+            num_reqs_padded,
+            num_reqs,
+            query_start_loc_np,
+            batch_desc.cg_mode,
+            batch_desc.num_reqs,
         )
         async_copy_to_gpu(query_start_loc_np, out=self.input_buffers.query_start_loc)
 
@@ -311,7 +277,8 @@ class NPUModelRunner(GPUModelRunner):
 
         self.input_batch = AscendInputBatch(
             req_ids=req_ids,
-            num_reqs=num_reqs_padded,
+            num_reqs=num_reqs,
+            num_reqs_after_padding=num_reqs_padded,
             idx_mapping=idx_mapping,
             idx_mapping_np=idx_mapping_np,
             expanded_idx_mapping=expanded_idx_mapping,
@@ -394,37 +361,34 @@ class NPUModelRunner(GPUModelRunner):
     def _pad_query_start_loc_for_fia(
         self,
         num_tokens_padded: int,
-        num_tokens: int,
+        num_reqs_padded: int,
         num_reqs: int,
         query_start_loc_np: np.ndarray,
-        max_query_len: int,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+        batch_desc_num_reqs: int | None = None,
     ) -> tuple[np.ndarray, int]:
         """
         This function is only designed to satisfied the constraint that when the layout is TND,
         the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
         """
-        assert self.cudagraph_and_dp_padding is not None
-        _num_tokens_after_padding, _num_tokens_across_dp, synced_cudagraph_mode = self.cudagraph_and_dp_padding
-        cudagraph_runtime_mode = CUDAGraphMode(synced_cudagraph_mode)
-        if cudagraph_runtime_mode != CUDAGraphMode.FULL:
-            return query_start_loc_np, num_reqs
-        uniform_decode_query_len = self.cudagraph_manager.uniform_decode_query_len
-        is_uniform_decode = self.cudagraph_manager.is_uniform_decode(
-            num_reqs=num_reqs,
-            num_tokens=num_tokens,
-            max_query_len=max_query_len,
-        )
-        if is_uniform_decode:
+        # TODO: need refactor later, related to vllm PR #34043 this pr delete func
+        # relax_for_mixed_batch_cudagraphs, num_reqs no longer equals the actual number of requests.
+        if cudagraph_runtime_mode == CUDAGraphMode.FULL:
+            num_reqs_padded = num_reqs
+        else:
+            num_reqs_padded = batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs
+
+        if num_tokens_padded == num_reqs_padded * self.decode_query_len:
             # Uniform-batch case: num_reqs must be no greater than num_reqs_padded
-            num_reqs_padded = num_tokens_padded // uniform_decode_query_len
+            assert num_reqs <= num_reqs_padded
 
             last_loc = query_start_loc_np[num_reqs]
             query_start_loc_np[num_reqs + 1 : num_reqs_padded + 1] = (
-                np.arange(1, num_reqs_padded + 1 - num_reqs) * uniform_decode_query_len + last_loc
+                np.arange(1, num_reqs_padded + 1 - num_reqs) * self.decode_query_len + last_loc
             )
         else:
             # Mixed-batch case: num_reqs must equal num_reqs_padded
-            num_reqs_padded = min(num_tokens_padded, self.max_num_reqs)
+            assert num_reqs == num_reqs_padded
 
             # Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly
             query_start_loc_np[num_reqs_padded + 1] = num_tokens_padded
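To make the uniform-decode branch above concrete, here is the padding arithmetic on toy values (illustrative only; `decode_query_len = 1`, three real decode requests padded to a captured size of eight tokens):

```python
import numpy as np

num_reqs, num_reqs_padded, decode_query_len, num_tokens_padded = 3, 8, 1, 8
query_start_loc_np = np.zeros(num_reqs_padded + 2, dtype=np.int32)
query_start_loc_np[1 : num_reqs + 1] = np.cumsum([1, 1, 1])          # three real 1-token requests
last_loc = query_start_loc_np[num_reqs]
query_start_loc_np[num_reqs + 1 : num_reqs_padded + 1] = (
    np.arange(1, num_reqs_padded + 1 - num_reqs) * decode_query_len + last_loc
)
print(query_start_loc_np[: num_reqs_padded + 1])
# [0 1 2 3 4 5 6 7 8] -> the padded tail behaves like extra 1-token decode requests, and the
# last entry equals num_tokens_padded, satisfying the TND constraint from the docstring.
```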
@@ -20,6 +20,7 @@
 from typing import Any
 
 import torch
+from vllm.config.compilation import CUDAGraphMode
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.worker.gpu.model_states.default import DefaultModelState
 from vllm.v1.worker.utils import AttentionGroup
@@ -34,18 +35,28 @@ class AscendModelState(DefaultModelState):
     def prepare_attn(
         self,
         input_batch: AscendInputBatch,
+        cudagraph_mode: CUDAGraphMode,
         block_tables: tuple[torch.Tensor, ...],
         slot_mappings: torch.Tensor,
         attn_groups: list[list[AttentionGroup]],
         kv_cache_config: KVCacheConfig,
+        for_capture: bool = False,
     ) -> dict[str, Any]:
         """Override prepare_attn method because `build_attn_metadata` is different from vllm."""
+        if cudagraph_mode == CUDAGraphMode.FULL:
+            # Use padded sizes - padding is handled by model_runner.prepare_attn.
+            num_reqs = input_batch.num_reqs_after_padding
+            num_tokens = input_batch.num_tokens_after_padding
+        else:
+            # For piecewise cudagraphs and eager, use unpadded sizes.
+            num_reqs = input_batch.num_reqs
+            num_tokens = input_batch.num_tokens
         query_start_loc_cpu = torch.from_numpy(input_batch.query_start_loc_np)
         max_query_len = input_batch.num_scheduled_tokens.max().item()
         attn_metadata = build_attn_metadata(
             attn_groups=attn_groups,
-            num_reqs=input_batch.num_reqs,
-            num_tokens=input_batch.num_tokens,
+            num_reqs=num_reqs,
+            num_tokens=num_tokens,
             query_start_loc_gpu=input_batch.query_start_loc,
             query_start_loc_cpu=query_start_loc_cpu,
             max_query_len=max_query_len,
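A tiny illustration of the size selection that the new branch performs, with made-up numbers (not part of the commit): for a FULL-graph batch padded from 3 real requests and 3 tokens up to a captured size of 8 and 8, `build_attn_metadata` must see the padded shapes so the replayed graph matches what was captured.

```python
# Illustrative only: which sizes prepare_attn ends up using.
num_reqs_real, num_tokens_real = 3, 3
num_reqs_after_padding, num_tokens_after_padding = 8, 8

is_full_graph = True  # i.e. cudagraph_mode == CUDAGraphMode.FULL
num_reqs = num_reqs_after_padding if is_full_graph else num_reqs_real
num_tokens = num_tokens_after_padding if is_full_graph else num_tokens_real
print(num_reqs, num_tokens)  # 8 8 -> shapes match the captured ACL graph
```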
@@ -1,12 +1,8 @@
 from contextlib import contextmanager
 
 import torch
-import vllm
 from vllm.logger import logger
 
-from vllm_ascend.worker.v2.block_table import AscendBlockTables
-from vllm_ascend.worker.v2.model_states import init_asecnd_model_state
-
 
 @contextmanager
 def torch_cuda_wrapper():
@@ -27,27 +23,3 @@ def torch_cuda_wrapper():
         yield
     finally:
         pass
-
-
-@contextmanager
-def block_table_wrapper():
-    try:
-        # vllm-ascend need to initialize slot mapping as torch.int32 dtype,
-        # but vllm default is torch.int64 dtype.
-        vllm.v1.worker.gpu.model_runner.BlockTables = AscendBlockTables
-        logger.info_once("Wrapping BlockTables with AscendBlockTables.")
-        yield
-    finally:
-        pass
-
-
-@contextmanager
-def model_states_wrapper():
-    try:
-        # prepare_attn in AscendModelState is different from vllm,
-        # we need to override init_model_state.
-        vllm.v1.worker.gpu.model_runner.init_model_state = init_asecnd_model_state
-        logger.info_once("Wrapping init_model_state with init_asecnd_model_state.")
-        yield
-    finally:
-        pass