adapt to main2main for model runner v2 (#7578)

### What this PR does / why we need it?
This PR adapts model runner v2 to the newest commit of the vLLM main
branch. Please refer to
https://github.com/vllm-project/vllm-ascend/issues/5208
### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

- vLLM version: v0.18.0
- vLLM main:
ed359c497a

---------

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
This commit is contained in:
Ronald
2026-03-25 09:08:44 +08:00
committed by GitHub
parent fc3ec100bc
commit d96440924a
16 changed files with 239 additions and 264 deletions

View File

@@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development.
Please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208)
for specific plans.
supported vllm version: main@4034c3d32e30d01639459edd3ab486f56993876d
related PR: <https://github.com/vllm-project/vllm-ascend/pull/7110>
supported vllm version: main@ed359c497a728f08b5b41456c07a688ccd510fbc
related PR: <https://github.com/vllm-project/vllm-ascend/pull/7598>

View File

@@ -16,128 +16,68 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from contextlib import contextmanager
from typing import Any
import numpy as np
import torch
import torch.nn as nn
import vllm
from vllm.config import VllmConfig
from vllm.config.compilation import CUDAGraphMode
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import logger
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
from vllm.v1.worker.gpu.block_table import BlockTables
from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor, ModelCudaGraphManager
from vllm.v1.worker.gpu.input_batch import InputBuffers
from vllm.v1.worker.gpu.model_states.interface import ModelState
from vllm.v1.worker.utils import AttentionGroup
from vllm_ascend.ascend_forward_context import _EXTRA_CTX
from vllm_ascend.compilation.acl_graph import set_graph_params, update_full_graph_params
from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
from vllm_ascend.worker.v2.utils import torch_cuda_wrapper
class AclGraphManager(CudaGraphManager):
"""ACL Graph Manager for Ascend NPUs."""
class ModelAclGraphManager(ModelCudaGraphManager):
"""ACL Model Cuda Graph Manager for Ascend NPUs."""
def __init__(
self,
vllm_config: VllmConfig,
use_aux_hidden_state_outputs: bool,
device: torch.device,
model_runner: Any,  # NPUModelRunner; typed as Any to avoid a circular import
cudagraph_mode: CUDAGraphMode,
decode_query_len: int,
model_runner: Any,
):
super().__init__(
vllm_config,
device,
cudagraph_mode,
decode_query_len,
)
# Keep a reference to the model runner so its attributes can be accessed
# when `run_fullgraph` is called in CudaGraphManager; this way we don't
# need to copy the `execute_model` method into `NPUModelRunner`.
self.model_runner = model_runner
super().__init__(
vllm_config,
use_aux_hidden_state_outputs,
device,
)
# capture_sizes is sorted in ascending order.
self.capture_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
# vllm-ascend needs to update the graph params of the attention backend,
# so set the graph params before capturing the full graph.
if super().needs_capture():
set_graph_params(self.cudagraph_sizes)
set_graph_params(self.capture_sizes)
def _capture_full_graph(
self,
num_tokens: int,
num_reqs: int,
model: nn.Module,
input_ids: torch.Tensor,
positions: torch.Tensor,
inputs_embeds: torch.Tensor | None,
num_tokens_across_dp: torch.Tensor,
attn_metadata: dict[str, Any] | None,
slot_mappings: dict[str, torch.Tensor] | None,
has_lora: bool = False,
) -> None:
"""Override _capture_full_graph because we need to set capturing=True in forward context."""
# set capturing=True in before model forward.
model = ModelWithContext(model)
return super()._capture_full_graph(
num_tokens,
num_reqs,
model,
input_ids,
positions,
inputs_embeds,
num_tokens_across_dp,
attn_metadata,
slot_mappings,
has_lora,
)
def capture_graph(
self,
num_tokens: int,
capture_cg_mode: CUDAGraphMode,
model: nn.Module,
model_state: ModelState,
input_buffers: InputBuffers,
block_tables: BlockTables,
attn_groups: list[list[AttentionGroup]],
kv_cache_config: KVCacheConfig,
has_lora: bool = False,
uniform_decode: bool = False,
) -> None:
with torch_cuda_wrapper(), prepare_capture_inputs_wrapper():
super().capture_graph(
num_tokens,
capture_cg_mode,
model,
model_state,
input_buffers,
block_tables,
attn_groups,
kv_cache_config,
has_lora,
uniform_decode,
)
def run_fullgraph(self, num_tokens: int) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:
def run_fullgraph(self, desc: BatchExecutionDescriptor) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:
"""Override run_fullgraph to update full graph params in run_fullgraph."""
num_tokens = desc.num_tokens
logger.info_once(f"run_fullgraph with num_tokens={num_tokens}")
ret = super().run_fullgraph(num_tokens)
assert self.model_runner.cudagraph_and_dp_padding is not None
ret = super().run_fullgraph(desc)
positions = self.model_runner.input_buffers.positions[:num_tokens]
_num_tokens_after_padding, num_tokens_across_dp, synced_cudagraph_mode = (
self.model_runner.cudagraph_and_dp_padding
)
cudagraph_runtime_mode = CUDAGraphMode(synced_cudagraph_mode)
# Refer to vllm.v1.worker.gpu.dp_utils.sync_cudagraph_and_dp_padding for
# how num_tokens_across_dp is calculated.
num_tokens_across_dp = torch.full([self.model_runner.dp_size], num_tokens, device=self.device)
with set_forward_context(
self.model_runner.input_batch.attn_metadata,
self.vllm_config,
num_tokens=num_tokens,
cudagraph_runtime_mode=cudagraph_runtime_mode,
cudagraph_runtime_mode=desc.cg_mode,
num_tokens_across_dp=num_tokens_across_dp,
batch_descriptor=None,  # Full-graph mode doesn't need a batch_descriptor
slot_mapping=self.model_runner.input_batch.slot_mappings,
@@ -155,79 +95,31 @@ class AclGraphManager(CudaGraphManager):
)
return ret
def is_uniform_decode(
def capture(
self,
num_reqs: int,
num_tokens: int,
max_query_len: int,
):
return (max_query_len == self.uniform_decode_query_len) and (num_tokens == max_query_len * num_reqs)
@contextmanager
def prepare_capture_inputs_wrapper():
"""Context manager to override input preparation for NPU graph capture."""
# TODO(Ronald1995): make prepare_inputs_to_capture a static method
# in CudaGraphManager.
ori = vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture
try:
vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture = prepare_inputs_to_capture
yield
finally:
vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture = ori
def prepare_inputs_to_capture(
num_reqs: int,
num_tokens: int,
input_buffers: InputBuffers,
block_tables: BlockTables,
attn_groups: list[list[AttentionGroup]],
max_model_len: int,
kv_cache_config: KVCacheConfig,
uniform_decode_query_len: int = 0,
) -> tuple[dict[str, Any], dict[str, torch.Tensor]]:
if uniform_decode_query_len > 0:
num_tokens_per_req = uniform_decode_query_len
else:
num_tokens_per_req = num_tokens // num_reqs
query_start_loc_np = np.arange(num_reqs + 1, dtype=np.int32) * num_tokens_per_req
query_start_loc_np[-1] = num_tokens
query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
input_buffers.query_start_loc[: num_reqs + 1] = query_start_loc_cpu
input_buffers.query_start_loc[num_reqs + 1 :] = num_tokens
query_start_loc = input_buffers.query_start_loc[: num_reqs + 1]
# HACK(woosuk): For faster warmup, we set seq_lens (GPU) to num_tokens
# rather than max_model_len.
input_buffers.seq_lens[:num_reqs] = num_tokens
input_buffers.seq_lens[num_reqs:] = 0
input_buffers.seq_lens_cpu[:num_reqs] = num_tokens
input_buffers.seq_lens_cpu[num_reqs:] = 0
input_buffers.dcp_local_seq_lens[:num_reqs] = num_tokens
input_buffers.dcp_local_seq_lens[num_reqs:] = 0
input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables]
slot_mappings = block_tables.slot_mappings[:, :num_tokens]
slot_mappings_by_layer = build_slot_mappings_by_layer(slot_mappings, kv_cache_config)
attn_metadata = build_attn_metadata(
attn_groups=attn_groups,
num_reqs=num_reqs,
num_tokens=num_tokens,
query_start_loc_gpu=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
max_query_len=num_tokens_per_req,
seq_lens=input_buffers.seq_lens,
max_seq_len=max_model_len,
block_tables=input_block_tables,
slot_mappings=slot_mappings,
kv_cache_config=kv_cache_config,
seq_lens_np=input_buffers.seq_lens_np,
)
return attn_metadata, slot_mappings_by_layer
model: nn.Module,
model_state: ModelState,
input_buffers: InputBuffers,
block_tables: BlockTables,
attn_groups: list[list[AttentionGroup]],
kv_cache_config: KVCacheConfig,
has_lora: bool = False,
use_aux_hidden_state_outputs: bool = False,
progress_bar_desc: str = "Capturing CUDA graphs",
) -> None:
"""Capture CUDA graphs for model forward pass."""
model = ModelWithContext(model)
return super().capture(
model,
model_state,
input_buffers,
block_tables,
attn_groups,
kv_cache_config,
has_lora,
use_aux_hidden_state_outputs,
progress_bar_desc,
)
class ModelWithContext(nn.Module):
@@ -242,6 +134,7 @@ class ModelWithContext(nn.Module):
def forward(self, *args, **kwargs):
# In the warmup phase, capturing=False by default.
# While capturing, we need to set capturing=True in the forward context.
_EXTRA_CTX.capturing = True
if torch.npu.is_current_stream_capturing():
_EXTRA_CTX.capturing = True
return self.original_model(*args, **kwargs)

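Since the hunk above shows only the `forward` body of `ModelWithContext`, here is a minimal, self-contained sketch of what such a wrapper could look like; the `__init__` and the class name `ModelWithContextSketch` are assumptions for illustration, and only the capture-flag logic mirrors the diff. It assumes a torch_npu environment so that `torch.npu` is available.

```python
import torch
import torch.nn as nn

# _EXTRA_CTX is the per-context flag holder imported at the top of the file.
from vllm_ascend.ascend_forward_context import _EXTRA_CTX


class ModelWithContextSketch(nn.Module):
    """Hypothetical wrapper that flags graph capture in the forward context."""

    def __init__(self, original_model: nn.Module):
        super().__init__()
        self.original_model = original_model

    def forward(self, *args, **kwargs):
        # During eager warmup the flag stays False; it is flipped only while
        # the current NPU stream is actually capturing a graph.
        if torch.npu.is_current_stream_capturing():
            _EXTRA_CTX.capturing = True
        return self.original_model(*args, **kwargs)
```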
View File

@@ -79,14 +79,12 @@ class AscendInputBatch(InputBatch):
num_reqs: int,
num_tokens: int,
input_buffers: AscendInputBuffers,
device: torch.device,
) -> "AscendInputBatch":
"""Override the make_dummy method to calculate seq_lens_np."""
input_batch = InputBatch.make_dummy(
num_reqs,
num_tokens,
input_buffers,
device,
)
# seq_len equals query_len
input_buffers.seq_lens_np[:num_reqs] = num_tokens // num_reqs

View File

@@ -17,17 +17,13 @@
# This file is a part of the vllm-ascend project.
#
import functools
import numpy as np
import torch
import vllm
from vllm.config import VllmConfig
from vllm.config.compilation import CUDAGraphMode
from vllm.sequence import IntermediateTensors
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.worker.gpu.buffer_utils import async_copy_to_gpu
from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor
from vllm.v1.worker.gpu.input_batch import (
combine_sampled_and_draft_tokens,
expand_idx_mapping,
@@ -38,21 +34,21 @@ from vllm.v1.worker.gpu.model_runner import GPUModelRunner
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import set_weight_prefetch_method
from vllm_ascend.worker.v2.aclgraph_utils import AclGraphManager
from vllm_ascend.worker.v2.aclgraph_utils import ModelAclGraphManager
from vllm_ascend.worker.v2.attn_utils import build_attn_state
from vllm_ascend.worker.v2.input_batch import AscendInputBatch, AscendInputBuffers
from vllm_ascend.worker.v2.sample.sampler import AscendSampler
from vllm_ascend.worker.v2.spec_decode import init_speculator
from vllm_ascend.worker.v2.spec_decode.eagle import AscendEagleSpeculator
from vllm_ascend.worker.v2.states import AscendRequestState
from vllm_ascend.worker.v2.utils import block_table_wrapper, model_states_wrapper, torch_cuda_wrapper
from vllm_ascend.worker.v2.utils import torch_cuda_wrapper
class NPUModelRunner(GPUModelRunner):
"""Model runner for Ascend NPUs."""
def __init__(self, vllm_config: VllmConfig, device: torch.device):
with torch_cuda_wrapper(), block_table_wrapper(), model_states_wrapper():
with torch_cuda_wrapper():
super().__init__(vllm_config, device)
# Because we will override these attributes, delete them to
@@ -64,11 +60,12 @@ class NPUModelRunner(GPUModelRunner):
del self.speculator
# NPU specific initializations can be added below.
self.cudagraph_manager: AclGraphManager = AclGraphManager(
self.cudagraph_manager: ModelAclGraphManager = ModelAclGraphManager(
self.vllm_config,
self.use_aux_hidden_state_outputs,
self.device,
self,
self.compilation_config.cudagraph_mode,
decode_query_len=self.decode_query_len,
model_runner=self,
)
# We define AscendEagleSpeculator in vllm_ascend.worker.v2.spec_decode.eagle
@@ -138,50 +135,17 @@ class NPUModelRunner(GPUModelRunner):
# so we can inherit `execute_model` method.
self.input_batch: AscendInputBatch | None = None
@torch.inference_mode()
def execute_model(
self,
scheduler_output: SchedulerOutput,
intermediate_tensors: IntermediateTensors | None = None,
dummy_run: bool = False,
skip_attn_for_dummy_run: bool = False,
) -> ModelRunnerOutput | IntermediateTensors | None:
"""Override GPUModelRunner.execute_model for Ascend NPUs by there reasons:
1. when run fullgraph, we need to use ret value of `get_cudagraph_and_dp_padding`
to set forward_context in `run_fullgraph`.
"""
# Use a closure to store the return value of get_cudagraph_and_dp_padding on the model runner.
def wrapper(func):
@functools.wraps(func)
def inner(*args, **kwargs):
self.cudagraph_and_dp_padding = func(*args, **kwargs)
return self.cudagraph_and_dp_padding
return inner
if self.cudagraph_and_dp_padding is None:
vllm.v1.worker.gpu.model_runner.get_cudagraph_and_dp_padding = wrapper(
vllm.v1.worker.gpu.model_runner.get_cudagraph_and_dp_padding
)
return super().execute_model(
scheduler_output,
intermediate_tensors,
dummy_run,
skip_attn_for_dummy_run,
)
def prepare_inputs(
self,
scheduler_output: SchedulerOutput,
num_tokens_after_padding: int,
batch_desc: BatchExecutionDescriptor,
) -> AscendInputBatch:
"""Override GPUModelRunner.prepare_inputs for Ascend NPUs.
npu attention backends need seq_lens_cpu to work.
so we need to prepare seq_lens_cpu here.
"""
num_tokens = scheduler_output.total_num_scheduled_tokens
num_tokens_after_padding = batch_desc.num_tokens
assert num_tokens > 0
num_tokens_per_req = scheduler_output.num_scheduled_tokens
num_reqs = len(num_tokens_per_req)
@@ -247,6 +211,7 @@ class NPUModelRunner(GPUModelRunner):
# Get query_start_loc.
# NOTE: For FULL mode we change +1 to +2 to reserve extra space for padding.
# See _pad_query_start_loc_for_fia.
num_reqs_padded = batch_desc.num_reqs or num_reqs
query_start_loc_np = np.empty(self.max_num_reqs + 2, dtype=np.int32)
query_start_loc_np[0] = 0
np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1 : num_reqs + 1])
@@ -256,11 +221,12 @@ class NPUModelRunner(GPUModelRunner):
# This is only required for vllm-ascend.
query_start_loc_np, num_reqs_padded = self._pad_query_start_loc_for_fia(
num_tokens_padded=num_tokens_after_padding,
num_tokens=num_tokens,
num_reqs=num_reqs,
query_start_loc_np=query_start_loc_np,
max_query_len=max(scheduler_output.num_scheduled_tokens.values()),
num_tokens_after_padding,
num_reqs_padded,
num_reqs,
query_start_loc_np,
batch_desc.cg_mode,
batch_desc.num_reqs,
)
async_copy_to_gpu(query_start_loc_np, out=self.input_buffers.query_start_loc)
@@ -311,7 +277,8 @@ class NPUModelRunner(GPUModelRunner):
self.input_batch = AscendInputBatch(
req_ids=req_ids,
num_reqs=num_reqs_padded,
num_reqs=num_reqs,
num_reqs_after_padding=num_reqs_padded,
idx_mapping=idx_mapping,
idx_mapping_np=idx_mapping_np,
expanded_idx_mapping=expanded_idx_mapping,
@@ -394,37 +361,34 @@ class NPUModelRunner(GPUModelRunner):
def _pad_query_start_loc_for_fia(
self,
num_tokens_padded: int,
num_tokens: int,
num_reqs_padded: int,
num_reqs: int,
query_start_loc_np: np.ndarray,
max_query_len: int,
cudagraph_runtime_mode: CUDAGraphMode | None = None,
batch_desc_num_reqs: int | None = None,
) -> tuple[np.ndarray, int]:
"""
This function only exists to satisfy the constraint that, when the layout is TND,
the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
"""
assert self.cudagraph_and_dp_padding is not None
_num_tokens_after_padding, _num_tokens_across_dp, synced_cudagraph_mode = self.cudagraph_and_dp_padding
cudagraph_runtime_mode = CUDAGraphMode(synced_cudagraph_mode)
if cudagraph_runtime_mode != CUDAGraphMode.FULL:
return query_start_loc_np, num_reqs
uniform_decode_query_len = self.cudagraph_manager.uniform_decode_query_len
is_uniform_decode = self.cudagraph_manager.is_uniform_decode(
num_reqs=num_reqs,
num_tokens=num_tokens,
max_query_len=max_query_len,
)
if is_uniform_decode:
# TODO: needs refactoring later. Related to vLLM PR #34043, which deletes the function
# relax_for_mixed_batch_cudagraphs; num_reqs no longer equals the actual number of requests.
if cudagraph_runtime_mode == CUDAGraphMode.FULL:
num_reqs_padded = num_reqs
else:
num_reqs_padded = batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs
if num_tokens_padded == num_reqs_padded * self.decode_query_len:
# Uniform-batch case: num_reqs must be no greater than num_reqs_padded
num_reqs_padded = num_tokens_padded // uniform_decode_query_len
assert num_reqs <= num_reqs_padded
last_loc = query_start_loc_np[num_reqs]
query_start_loc_np[num_reqs + 1 : num_reqs_padded + 1] = (
np.arange(1, num_reqs_padded + 1 - num_reqs) * uniform_decode_query_len + last_loc
np.arange(1, num_reqs_padded + 1 - num_reqs) * self.decode_query_len + last_loc
)
else:
# Mixed-batch case: num_reqs must equal num_reqs_padded
num_reqs_padded = min(num_tokens_padded, self.max_num_reqs)
assert num_reqs == num_reqs_padded
# Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly
query_start_loc_np[num_reqs_padded + 1] = num_tokens_padded

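To make the TND constraint above concrete, here is a small, hypothetical numpy sketch of the uniform-decode padding path: dummy single-token "requests" are appended so that the last used entry of `query_start_loc` equals the padded token count. The concrete sizes are illustrative only and do not come from the runner.

```python
import numpy as np

# Illustrative numbers only: 3 real decode requests (1 token each) padded
# up to a captured full-graph size of 6 tokens, with decode_query_len == 1.
num_reqs = 3
num_tokens_padded = 6
decode_query_len = 1
num_reqs_padded = num_tokens_padded // decode_query_len  # 6

# Real prefix of query_start_loc: one token per request -> [0, 1, 2, 3].
query_start_loc = np.zeros(num_reqs_padded + 1, dtype=np.int32)
query_start_loc[1 : num_reqs + 1] = np.arange(1, num_reqs + 1, dtype=np.int32)

# Append dummy single-token requests so the last element equals
# num_tokens_padded, matching the first dim of hidden_states in TND layout.
last_loc = query_start_loc[num_reqs]
query_start_loc[num_reqs + 1 : num_reqs_padded + 1] = (
    np.arange(1, num_reqs_padded + 1 - num_reqs) * decode_query_len + last_loc
)
print(query_start_loc)  # [0 1 2 3 4 5 6]; last element == num_tokens_padded
```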
View File

@@ -20,6 +20,7 @@
from typing import Any
import torch
from vllm.config.compilation import CUDAGraphMode
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.worker.gpu.model_states.default import DefaultModelState
from vllm.v1.worker.utils import AttentionGroup
@@ -34,18 +35,28 @@ class AscendModelState(DefaultModelState):
def prepare_attn(
self,
input_batch: AscendInputBatch,
cudagraph_mode: CUDAGraphMode,
block_tables: tuple[torch.Tensor, ...],
slot_mappings: torch.Tensor,
attn_groups: list[list[AttentionGroup]],
kv_cache_config: KVCacheConfig,
for_capture: bool = False,
) -> dict[str, Any]:
"""Override prepare_attn method because `build_attn_metadata` is different from vllm."""
if cudagraph_mode == CUDAGraphMode.FULL:
# Use padded sizes - padding is handled by model_runner.prepare_attn.
num_reqs = input_batch.num_reqs_after_padding
num_tokens = input_batch.num_tokens_after_padding
else:
# For piecewise cudagraphs and eager, use unpadded sizes.
num_reqs = input_batch.num_reqs
num_tokens = input_batch.num_tokens
query_start_loc_cpu = torch.from_numpy(input_batch.query_start_loc_np)
max_query_len = input_batch.num_scheduled_tokens.max().item()
attn_metadata = build_attn_metadata(
attn_groups=attn_groups,
num_reqs=input_batch.num_reqs,
num_tokens=input_batch.num_tokens,
num_reqs=num_reqs,
num_tokens=num_tokens,
query_start_loc_gpu=input_batch.query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
max_query_len=max_query_len,

View File

@@ -1,12 +1,8 @@
from contextlib import contextmanager
import torch
import vllm
from vllm.logger import logger
from vllm_ascend.worker.v2.block_table import AscendBlockTables
from vllm_ascend.worker.v2.model_states import init_asecnd_model_state
@contextmanager
def torch_cuda_wrapper():
@@ -27,27 +23,3 @@ def torch_cuda_wrapper():
yield
finally:
pass
@contextmanager
def block_table_wrapper():
try:
# vllm-ascend needs to initialize the slot mapping with torch.int32 dtype,
# but the vllm default is torch.int64.
vllm.v1.worker.gpu.model_runner.BlockTables = AscendBlockTables
logger.info_once("Wrapping BlockTables with AscendBlockTables.")
yield
finally:
pass
@contextmanager
def model_states_wrapper():
try:
# prepare_attn in AscendModelState differs from vllm's,
# so we need to override init_model_state.
vllm.v1.worker.gpu.model_runner.init_model_state = init_asecnd_model_state
logger.info_once("Wrapping init_model_state with init_asecnd_model_state.")
yield
finally:
pass