[Feature] support aclgraph for model runner v2 (#7110)

### What this PR does / why we need it? This PR adds aclgraph support for model runner v2; please see RFC #5208. The PR contains these modifications: - adapt to the newest commit of the vllm main branch. - provide a unified extra-forward-context interface for both model runner v1 and model runner v2. - implement graph mode for the main model. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? - vLLM version: v0.16.0 - vLLM main: 4034c3d32e --------- Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2026-03-13 09:11:46 +08:00
parent 1f71da80eb
commit c980e68d40
52 changed files with 840 additions and 309 deletions
--- a/vllm_ascend/worker/v2/input_batch.py
+++ b/vllm_ascend/worker/v2/input_batch.py
@@ -22,6 +22,8 @@ import numpy as np
 import torch
 from vllm.v1.worker.gpu.input_batch import InputBatch, InputBuffers

+from vllm_ascend.attention.attention_v1 import AscendAttentionState
+

 class AscendInputBuffers(InputBuffers):
    """Input buffers for Ascend NPUs."""
@@ -37,6 +39,16 @@ class AscendInputBuffers(InputBuffers):
            max_num_tokens,
            device,
        )
+        del self.query_start_loc
+
+        # NOTE: For FULL mode we change +1 to +2 to reserve extra space for padding.
+        # See _pad_query_start_loc_for_fia.
+        self.query_start_loc: torch.Tensor = torch.zeros(
+            max_num_reqs + 2,
+            dtype=torch.int32,
+            device=device,
+        )
+
        # Create seq_lens_cpu and seq_lens_np.
        # npu's attention backend still needs seq_lens on CPU side.
        self.seq_lens_cpu: torch.Tensor = torch.zeros(
@@ -56,6 +68,8 @@ class AscendInputBatch(InputBatch):
    # Create seq_lens_np.
    # npu's attention backend still needs seq_lens on CPU side.
    seq_lens_np: np.ndarray
+    # attn_state is used to build attention metadata.
+    attn_state: AscendAttentionState | None = None

    @classmethod
    def make_dummy(
@@ -79,4 +93,11 @@ class AscendInputBatch(InputBatch):
        input_buffers.seq_lens_np[num_reqs:] = 0
        seq_lens_np = input_buffers.seq_lens_np[:num_reqs]
        input_batch.seq_lens_np = seq_lens_np
+        # This is a dummy run, used for either dp or memory profiling.
+        # For a dp dummy run, num_tokens is set to 1,
+        # so attn_state is set to DecodeOnly.
+        # For a memory-profiling dummy run,
+        # attention metadata isn't needed,
+        # so attn_state can also be set to AscendAttentionState.DecodeOnly.
+        input_batch.attn_state = AscendAttentionState.DecodeOnly
        return cls(**asdict(input_batch), seq_lens_np=seq_lens_np)