adapt to main2main for model runner v2 (#7578)

### What this PR does / why we need it?
This PR adapts to the newest commit of the vLLM main branch for model
runner v2. Please refer to
https://github.com/vllm-project/vllm-ascend/issues/5208
### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

- vLLM version: v0.18.0
- vLLM main: ed359c497a

---------

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Author: Ronald
Date: 2026-03-25 09:08:44 +08:00
Committed by: GitHub
Parent: fc3ec100bc
Commit: d96440924a
16 changed files with 239 additions and 264 deletions

View File

@@ -312,7 +312,7 @@
# Future Plan:
# Remove this patch when vLLM aligns with the latest processor implementation.
#
-# ** 10. File: worker/patch_v2_eagle.py**
+# ** 10. File: worker/patch_v2/patch_eagle.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose`
# Why:
@@ -348,7 +348,7 @@
# Future Plan:
# Remove this patch when the PTA version used by vllm-ascend has been upgraded.
#
-# ** 13. File: worker/patch_v2_uva.py**
+# ** 13. File: worker/patch_v2/patch_uva.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.worker.gpu.states.UvaBuffer`
# Why:
@@ -553,3 +553,48 @@
# Future Plan:
# The maybe_remap_kv_scale_name function of the community is reconstructed to support
# multiple backends.
# ** 24. File: worker/patch_v2/patch_input_batch.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.worker.gpu.input_batch.InputBatch`
# Why:
# vLLM uses InputBatch to make dummy tensors in `model_runner.py` and `cudagraph_utils.py`,
# which makes it difficult to inherit vLLM's methods.
# How:
# Replace InputBatch with AscendInputBatch.
# Future Plan:
# Remove this patch when vllm-ascend's make_dummy behavior aligns with vLLM's.
# ** 25. File: worker/patch_v2/patch_block_table.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.worker.gpu.block_table.BlockTables`
# Why:
# vllm-ascend needs to initialize the slot mapping with torch.int32 dtype,
# but vLLM's default is torch.int64.
# How:
# Replace BlockTables with AscendBlockTables, which initializes the slot mapping
# with torch.int32 dtype.
# Future Plan:
# Remove this patch when vllm-ascend's BlockTables can initialize the
# slot mapping with torch.int64 dtype.
# ** 26. File: worker/patch_v2/patch_model_state.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.worker.gpu.model_states.default.init_model_state`
# Why:
# vllm-ascend's prepare_attn in ModelState is different from vLLM's,
# so we need to override init_model_state.
# How:
# Define AscendModelState and initialize it in init_model_state.
# Future Plan:
# Remove this patch when vllm-ascend's attention metadata aligns with vLLM's.
# ** 27. File: worker/patch_v2/patch_triton.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.v1.worker.gpu.sample.logprob`, `vllm.v1.worker.gpu.sample.penalties.apply_penalties`,
# `vllm.v1.worker.gpu.sample.gumbel.gumbel_sample`
# Why:
# Triton ops in vLLM do not perform well on NPU, and there is no dispatch mechanism for Triton ops.
# How:
# Override Triton ops in vLLM with the Ascend implementations.
# Related PR (if no, explain why):
# Make vLLM support dispatch for Triton ops.
# Future Plan:
# Remove this patch when vLLM supports the dispatch mechanism.
#
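All of the patch_v2 modules listed above use the same import-time rebinding mechanism: import the vLLM module that holds the symbol, then point that symbol at the Ascend replacement. A minimal sketch of the pattern, mirroring the patch_block_table.py diff later in this commit:

```python
# Sketch of the import-time rebinding pattern shared by the patch_v2 modules.
# Names mirror the patch_block_table.py file added in this commit.
from vllm.v1.worker.gpu import model_runner

from vllm_ascend.worker.v2.block_table import AscendBlockTables

# After this assignment, any later lookup of model_runner.BlockTables inside
# vLLM resolves to the Ascend class, so NPUModelRunner can reuse the upstream
# code paths without copying them.
model_runner.BlockTables = AscendBlockTables
```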

View File

@@ -19,6 +19,7 @@ from vllm.triton_utils import HAS_TRITON
if HAS_TRITON:
    import vllm_ascend.patch.worker.patch_triton
+    import vllm_ascend.patch.worker.patch_v2.patch_triton # noqa
# isort: off
@@ -36,8 +37,8 @@ import vllm_ascend.patch.worker.patch_qwen3_next # noqa
import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
-import vllm_ascend.patch.worker.patch_v2_eagle # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_eagle # noqa
-import vllm_ascend.patch.worker.patch_v2_uva # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_uva # noqa
import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa
import vllm_ascend.patch.worker.patch_routed_experts_capturer # noqa
import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa
@@ -45,3 +46,6 @@ import vllm_ascend.patch.worker.patch_kimi_k25 # noqa
import vllm_ascend.patch.worker.patch_draft_quarot # noqa
import vllm_ascend.patch.worker.patch_cudagraph # noqa
import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_input_batch # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_model_state # noqa
+import vllm_ascend.patch.worker.patch_v2.patch_block_table # noqa

View File

@@ -5,7 +5,6 @@ from vllm_ascend.ops.triton.fla.chunk import chunk_gated_delta_rule
from vllm_ascend.ops.triton.fla.layernorm_guard import LayerNormFn
from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_recurrent_gated_delta_rule_fwd_kernel
from vllm_ascend.ops.triton.mamba.causal_conv1d import causal_conv1d_fn, causal_conv1d_update_npu
-from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample as ascend_gumbel_sample
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
@@ -14,4 +13,3 @@ vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_r
)
vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
vllm.model_executor.layers.fla.ops.chunk_gated_delta_rule = chunk_gated_delta_rule
-vllm.v1.worker.gpu.sample.gumbel.gumbel_sample = ascend_gumbel_sample

View File

@@ -0,0 +1,25 @@
# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/block_table.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from vllm.v1.worker.gpu import model_runner
from vllm_ascend.worker.v2.block_table import AscendBlockTables
# vllm-ascend needs to initialize the slot mapping with torch.int32 dtype,
# but vLLM's default is torch.int64.
model_runner.BlockTables = AscendBlockTables

View File

@@ -0,0 +1,27 @@
# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/input_batch.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# Explicitly import the modules so they are loaded before being patched.
from vllm.v1.worker.gpu import cudagraph_utils, model_runner
from vllm_ascend.worker.v2.input_batch import AscendInputBatch
cudagraph_utils.InputBatch = AscendInputBatch
model_runner.InputBatch = AscendInputBatch
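A note on why the patch above rebinds the name in both `cudagraph_utils` and `model_runner`: each vLLM module that does `from ... import InputBatch` gets its own binding, so patching only the defining module would not update consumers that already imported it. A self-contained illustration of that Python behaviour (module names below are stand-ins, not vLLM's):

```python
# Demonstrates why every importing module must be patched separately.
import types

lib = types.ModuleType("lib")          # stands in for vllm.v1.worker.gpu.input_batch
lib.InputBatch = type("InputBatch", (), {})

user_a = types.ModuleType("user_a")    # stands in for model_runner
user_b = types.ModuleType("user_b")    # stands in for cudagraph_utils
user_a.InputBatch = lib.InputBatch     # effect of `from lib import InputBatch`
user_b.InputBatch = lib.InputBatch

AscendInputBatch = type("AscendInputBatch", (lib.InputBatch,), {})

# Patching only the source module does not update the copies already imported.
lib.InputBatch = AscendInputBatch
assert user_a.InputBatch is not AscendInputBatch

# So the patch module rebinds the name on each consumer, as done above.
user_a.InputBatch = AscendInputBatch
user_b.InputBatch = AscendInputBatch
assert user_a.InputBatch is AscendInputBatch
```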

View File

@@ -0,0 +1,26 @@
# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/model_states/default.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from vllm.v1.worker.gpu import model_runner
from vllm_ascend.worker.v2.model_states import init_asecnd_model_state
# prepare_attn in AscendModelState is different from vLLM's,
# so we need to override init_model_state.
model_runner.init_model_state = init_asecnd_model_state

View File

@@ -0,0 +1,12 @@
from vllm.v1.worker.gpu import input_batch
from vllm.v1.worker.gpu.sample import gumbel, logprob, penalties
from vllm_ascend.worker.v2.input_batch import post_update
from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample
from vllm_ascend.worker.v2.sample.logprob import compute_token_logprobs
from vllm_ascend.worker.v2.sample.penalties import apply_penalties
logprob.compute_token_logprobs = compute_token_logprobs
penalties.apply_penalties = apply_penalties
gumbel.gumbel_sample = gumbel_sample
input_batch.post_update = post_update
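These rebinds only take effect if the patch module is imported before vLLM calls the patched functions, which is why the worker patch `__init__.py` diff above imports it under the `HAS_TRITON` guard. A minimal usage sketch (import paths as in this commit):

```python
# Activating the Triton-op overrides: importing the patch module rebinds the
# vLLM logprob/penalty/gumbel functions to their Ascend implementations as a
# side effect, so the import must happen early in worker start-up.
from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
    import vllm_ascend.patch.worker.patch_v2.patch_triton  # noqa: F401
```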

View File

@@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development.
please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208)
to get specific plans.
-supported vllm version: main@4034c3d32e30d01639459edd3ab486f56993876d
+supported vllm version: main@ed359c497a728f08b5b41456c07a688ccd510fbc
-related PR: <https://github.com/vllm-project/vllm-ascend/pull/7110>
+related PR: <https://github.com/vllm-project/vllm-ascend/pull/7598>

View File

@@ -16,128 +16,68 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
-from contextlib import contextmanager
from typing import Any

-import numpy as np
import torch
import torch.nn as nn
-import vllm
from vllm.config import VllmConfig
from vllm.config.compilation import CUDAGraphMode
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import logger
from vllm.v1.kv_cache_interface import KVCacheConfig
-from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
from vllm.v1.worker.gpu.block_table import BlockTables
-from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
+from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor, ModelCudaGraphManager
from vllm.v1.worker.gpu.input_batch import InputBuffers
from vllm.v1.worker.gpu.model_states.interface import ModelState
from vllm.v1.worker.utils import AttentionGroup
from vllm_ascend.ascend_forward_context import _EXTRA_CTX
from vllm_ascend.compilation.acl_graph import set_graph_params, update_full_graph_params
-from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
-from vllm_ascend.worker.v2.utils import torch_cuda_wrapper


-class AclGraphManager(CudaGraphManager):
-    """ACL Graph Manager for Ascend NPUs."""
+class ModelAclGraphManager(ModelCudaGraphManager):
+    """ACL Model Cuda Graph Manager for Ascend NPUs."""

    def __init__(
        self,
        vllm_config: VllmConfig,
-        use_aux_hidden_state_outputs: bool,
        device: torch.device,
-        model_runner: Any,  # NPUModelRunner type, in case circular import, so we pass it as Any
+        cudagraph_mode: CUDAGraphMode,
+        decode_query_len: int,
+        model_runner: Any,
    ):
+        super().__init__(
+            vllm_config,
+            device,
+            cudagraph_mode,
+            decode_query_len,
+        )
        # set model runner attribute, so we can access attributes model runner
        # when call `run_fullgraph` method in CudaGraphManager,
        # then we don't need to # copy `execute_model` method in `NPUModelRunner` class.
        self.model_runner = model_runner
-        super().__init__(
-            vllm_config,
-            use_aux_hidden_state_outputs,
-            device,
-        )
+        # capture_sizes sorts in ascending order.
+        self.capture_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
        # vllm-ascend need to update graph params of attention backend.
        # so we need to set graph params before capture full graph.
        if super().needs_capture():
-            set_graph_params(self.cudagraph_sizes)
+            set_graph_params(self.capture_sizes)

-    def _capture_full_graph(
-        self,
-        num_tokens: int,
-        num_reqs: int,
-        model: nn.Module,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        inputs_embeds: torch.Tensor | None,
-        num_tokens_across_dp: torch.Tensor,
-        attn_metadata: dict[str, Any] | None,
-        slot_mappings: dict[str, torch.Tensor] | None,
-        has_lora: bool = False,
-    ) -> None:
-        """Override _capture_full_graph because we need to set capturing=True in forward context."""
-        # set capturing=True in before model forward.
-        model = ModelWithContext(model)
-        return super()._capture_full_graph(
-            num_tokens,
-            num_reqs,
-            model,
-            input_ids,
-            positions,
-            inputs_embeds,
-            num_tokens_across_dp,
-            attn_metadata,
-            slot_mappings,
-            has_lora,
-        )

-    def capture_graph(
-        self,
-        num_tokens: int,
-        capture_cg_mode: CUDAGraphMode,
-        model: nn.Module,
-        model_state: ModelState,
-        input_buffers: InputBuffers,
-        block_tables: BlockTables,
-        attn_groups: list[list[AttentionGroup]],
-        kv_cache_config: KVCacheConfig,
-        has_lora: bool = False,
-        uniform_decode: bool = False,
-    ) -> None:
-        with torch_cuda_wrapper(), prepare_capture_inputs_wrapper():
-            super().capture_graph(
-                num_tokens,
-                capture_cg_mode,
-                model,
-                model_state,
-                input_buffers,
-                block_tables,
-                attn_groups,
-                kv_cache_config,
-                has_lora,
-                uniform_decode,
-            )

-    def run_fullgraph(self, num_tokens: int) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:
+    def run_fullgraph(self, desc: BatchExecutionDescriptor) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:
        """Override run_fullgraph to update full graph params in run_fullgraph."""
+        num_tokens = desc.num_tokens
        logger.info_once(f"run_fullgraph with num_tokens={num_tokens}")
-        ret = super().run_fullgraph(num_tokens)
+        ret = super().run_fullgraph(desc)
-        assert self.model_runner.cudagraph_and_dp_padding is not None
        positions = self.model_runner.input_buffers.positions[:num_tokens]
-        _num_tokens_after_padding, num_tokens_across_dp, synced_cudagraph_mode = (
-            self.model_runner.cudagraph_and_dp_padding
-        )
-        cudagraph_runtime_mode = CUDAGraphMode(synced_cudagraph_mode)
+        # refer to vllm.v1.worker.gpu.dp_utils.sync_cudagraph_and_dp_padding to
+        # calculate num_tokens_across_dp.
+        num_tokens_across_dp = torch.full([self.model_runner.dp_size], num_tokens, device=self.device)
        with set_forward_context(
            self.model_runner.input_batch.attn_metadata,
            self.vllm_config,
            num_tokens=num_tokens,
-            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            cudagraph_runtime_mode=desc.cg_mode,
            num_tokens_across_dp=num_tokens_across_dp,
            batch_descriptor=None, # Full graph model don't need batch_descriptor
            slot_mapping=self.model_runner.input_batch.slot_mappings,
@@ -155,79 +95,31 @@ class AclGraphManager(CudaGraphManager):
        )
        return ret

-    def is_uniform_decode(
-        self,
-        num_reqs: int,
-        num_tokens: int,
-        max_query_len: int,
-    ):
-        return (max_query_len == self.uniform_decode_query_len) and (num_tokens == max_query_len * num_reqs)
+    def capture(
+        self,
+        model: nn.Module,
+        model_state: ModelState,
+        input_buffers: InputBuffers,
+        block_tables: BlockTables,
+        attn_groups: list[list[AttentionGroup]],
+        kv_cache_config: KVCacheConfig,
+        has_lora: bool = False,
+        use_aux_hidden_state_outputs: bool = False,
+        progress_bar_desc: str = "Capturing CUDA graphs",
+    ) -> None:
+        """Capture CUDA graphs for model forward pass."""
+        model = ModelWithContext(model)
+        return super().capture(
+            model,
+            model_state,
+            input_buffers,
+            block_tables,
+            attn_groups,
+            kv_cache_config,
+            has_lora,
+            use_aux_hidden_state_outputs,
+            progress_bar_desc,
+        )

-@contextmanager
-def prepare_capture_inputs_wrapper():
-    """Context manager to override input preparation for NPU graph capture."""
-    # TODO(Ronald1995): make prepare_inputs_to_capture as static method
-    # in CudaGraphManager.
-    ori = vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture
-    try:
-        vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture = prepare_inputs_to_capture
-        yield
-    finally:
-        vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture = ori

-def prepare_inputs_to_capture(
-    num_reqs: int,
-    num_tokens: int,
-    input_buffers: InputBuffers,
-    block_tables: BlockTables,
-    attn_groups: list[list[AttentionGroup]],
-    max_model_len: int,
-    kv_cache_config: KVCacheConfig,
-    uniform_decode_query_len: int = 0,
-) -> tuple[dict[str, Any], dict[str, torch.Tensor]]:
-    if uniform_decode_query_len > 0:
-        num_tokens_per_req = uniform_decode_query_len
-    else:
-        num_tokens_per_req = num_tokens // num_reqs
-    query_start_loc_np = np.arange(num_reqs + 1, dtype=np.int32) * num_tokens_per_req
-    query_start_loc_np[-1] = num_tokens
-    query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
-    input_buffers.query_start_loc[: num_reqs + 1] = query_start_loc_cpu
-    input_buffers.query_start_loc[num_reqs + 1 :] = num_tokens
-    query_start_loc = input_buffers.query_start_loc[: num_reqs + 1]
-    # HACK(woosuk): For faster warmup, we set seq_lens (GPU) to num_tokens
-    # rather than max_model_len.
-    input_buffers.seq_lens[:num_reqs] = num_tokens
-    input_buffers.seq_lens[num_reqs:] = 0
-    input_buffers.seq_lens_cpu[:num_reqs] = num_tokens
-    input_buffers.seq_lens_cpu[num_reqs:] = 0
-    input_buffers.dcp_local_seq_lens[:num_reqs] = num_tokens
-    input_buffers.dcp_local_seq_lens[num_reqs:] = 0
-    input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables]
-    slot_mappings = block_tables.slot_mappings[:, :num_tokens]
-    slot_mappings_by_layer = build_slot_mappings_by_layer(slot_mappings, kv_cache_config)
-    attn_metadata = build_attn_metadata(
-        attn_groups=attn_groups,
-        num_reqs=num_reqs,
-        num_tokens=num_tokens,
-        query_start_loc_gpu=query_start_loc,
-        query_start_loc_cpu=query_start_loc_cpu,
-        max_query_len=num_tokens_per_req,
-        seq_lens=input_buffers.seq_lens,
-        max_seq_len=max_model_len,
-        block_tables=input_block_tables,
-        slot_mappings=slot_mappings,
-        kv_cache_config=kv_cache_config,
-        seq_lens_np=input_buffers.seq_lens_np,
-    )
-    return attn_metadata, slot_mappings_by_layer


class ModelWithContext(nn.Module):
@@ -242,6 +134,7 @@ class ModelWithContext(nn.Module):
    def forward(self, *args, **kwargs):
        # In warmup phase, capturing=False by default.
        # when capturing, we need to set capturing=True in forward context.
-        _EXTRA_CTX.capturing = True
+        if torch.npu.is_current_stream_capturing():
+            _EXTRA_CTX.capturing = True
        return self.original_model(*args, **kwargs)

View File

@@ -79,14 +79,12 @@ class AscendInputBatch(InputBatch):
        num_reqs: int,
        num_tokens: int,
        input_buffers: AscendInputBuffers,
-        device: torch.device,
    ) -> "AscendInputBatch":
        """Override the make_dummy method to calculate seq_lens_np."""
        input_batch = InputBatch.make_dummy(
            num_reqs,
            num_tokens,
            input_buffers,
-            device,
        )
        # seq_len equals to query_len
        input_buffers.seq_lens_np[:num_reqs] = num_tokens // num_reqs
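A quick numeric check of the `make_dummy` override above: with a uniform dummy batch, every request gets the same query length, and `seq_lens_np` is set to exactly that value (the sizes below are illustrative, not taken from this diff):

```python
# Illustrative check of the seq_lens computation in AscendInputBatch.make_dummy.
import numpy as np

num_reqs, num_tokens = 4, 32
max_num_reqs = 8  # assumed buffer capacity, not a value from this diff
seq_lens_np = np.zeros(max_num_reqs, dtype=np.int32)
seq_lens_np[:num_reqs] = num_tokens // num_reqs  # seq_len equals query_len
assert seq_lens_np.tolist() == [8, 8, 8, 8, 0, 0, 0, 0]
```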

View File

@@ -17,17 +17,13 @@
# This file is a part of the vllm-ascend project.
#
-import functools
import numpy as np
import torch
-import vllm
from vllm.config import VllmConfig
from vllm.config.compilation import CUDAGraphMode
-from vllm.sequence import IntermediateTensors
from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.worker.gpu.buffer_utils import async_copy_to_gpu
+from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor
from vllm.v1.worker.gpu.input_batch import (
    combine_sampled_and_draft_tokens,
    expand_idx_mapping,
@@ -38,21 +34,21 @@ from vllm.v1.worker.gpu.model_runner import GPUModelRunner
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import set_weight_prefetch_method
-from vllm_ascend.worker.v2.aclgraph_utils import AclGraphManager
+from vllm_ascend.worker.v2.aclgraph_utils import ModelAclGraphManager
from vllm_ascend.worker.v2.attn_utils import build_attn_state
from vllm_ascend.worker.v2.input_batch import AscendInputBatch, AscendInputBuffers
from vllm_ascend.worker.v2.sample.sampler import AscendSampler
from vllm_ascend.worker.v2.spec_decode import init_speculator
from vllm_ascend.worker.v2.spec_decode.eagle import AscendEagleSpeculator
from vllm_ascend.worker.v2.states import AscendRequestState
-from vllm_ascend.worker.v2.utils import block_table_wrapper, model_states_wrapper, torch_cuda_wrapper
+from vllm_ascend.worker.v2.utils import torch_cuda_wrapper


class NPUModelRunner(GPUModelRunner):
    """Model runner for Ascend NPUs."""

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
-        with torch_cuda_wrapper(), block_table_wrapper(), model_states_wrapper():
+        with torch_cuda_wrapper():
            super().__init__(vllm_config, device)
        # because we will override these attribute, delete these attribute to
@@ -64,11 +60,12 @@ class NPUModelRunner(GPUModelRunner):
        del self.speculator
        # NPU specific initializations can be added below.
-        self.cudagraph_manager: AclGraphManager = AclGraphManager(
+        self.cudagraph_manager: ModelAclGraphManager = ModelAclGraphManager(
            self.vllm_config,
-            self.use_aux_hidden_state_outputs,
            self.device,
-            self,
+            self.compilation_config.cudagraph_mode,
+            decode_query_len=self.decode_query_len,
+            model_runner=self,
        )
        # we define AscendEagleSpeculator in vllm_ascend.worker.v2.spec_decode.eagle
@@ -138,50 +135,17 @@ class NPUModelRunner(GPUModelRunner):
        # so we can inherit `execute_model` method.
        self.input_batch: AscendInputBatch | None = None

-    @torch.inference_mode()
-    def execute_model(
-        self,
-        scheduler_output: SchedulerOutput,
-        intermediate_tensors: IntermediateTensors | None = None,
-        dummy_run: bool = False,
-        skip_attn_for_dummy_run: bool = False,
-    ) -> ModelRunnerOutput | IntermediateTensors | None:
-        """Override GPUModelRunner.execute_model for Ascend NPUs by there reasons:
-        1. when run fullgraph, we need to use ret value of `get_cudagraph_and_dp_padding`
-        to set forward_context in `run_fullgraph`.
-        """
-        # use closure to store return value of get_cudagraph_and_dp_padding in model runner.
-        def wrapper(func):
-            @functools.wraps(func)
-            def inner(*args, **kwargs):
-                self.cudagraph_and_dp_padding = func(*args, **kwargs)
-                return self.cudagraph_and_dp_padding
-            return inner
-        if self.cudagraph_and_dp_padding is None:
-            vllm.v1.worker.gpu.model_runner.get_cudagraph_and_dp_padding = wrapper(
-                vllm.v1.worker.gpu.model_runner.get_cudagraph_and_dp_padding
-            )
-        return super().execute_model(
-            scheduler_output,
-            intermediate_tensors,
-            dummy_run,
-            skip_attn_for_dummy_run,
-        )

    def prepare_inputs(
        self,
        scheduler_output: SchedulerOutput,
-        num_tokens_after_padding: int,
+        batch_desc: BatchExecutionDescriptor,
    ) -> AscendInputBatch:
        """Override GPUModelRunner.prepare_inputs for Ascend NPUs.
        npu attention backends need seq_lens_cpu to work.
        so we need to prepare seq_lens_cpu here.
        """
        num_tokens = scheduler_output.total_num_scheduled_tokens
+        num_tokens_after_padding = batch_desc.num_tokens
        assert num_tokens > 0
        num_tokens_per_req = scheduler_output.num_scheduled_tokens
        num_reqs = len(num_tokens_per_req)
@@ -247,6 +211,7 @@ class NPUModelRunner(GPUModelRunner):
        # Get query_start_loc.
        # NOTE: For FULL mode we change +1 to +2 to reserve extra space for padding.
        # See _pad_query_start_loc_for_fia.
+        num_reqs_padded = batch_desc.num_reqs or num_reqs
        query_start_loc_np = np.empty(self.max_num_reqs + 2, dtype=np.int32)
        query_start_loc_np[0] = 0
        np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1 : num_reqs + 1])
@@ -256,11 +221,12 @@ class NPUModelRunner(GPUModelRunner):
        # This is only required for vllm-ascend.
        query_start_loc_np, num_reqs_padded = self._pad_query_start_loc_for_fia(
-            num_tokens_padded=num_tokens_after_padding,
-            num_tokens=num_tokens,
-            num_reqs=num_reqs,
-            query_start_loc_np=query_start_loc_np,
-            max_query_len=max(scheduler_output.num_scheduled_tokens.values()),
+            num_tokens_after_padding,
+            num_reqs_padded,
+            num_reqs,
+            query_start_loc_np,
+            batch_desc.cg_mode,
+            batch_desc.num_reqs,
        )
        async_copy_to_gpu(query_start_loc_np, out=self.input_buffers.query_start_loc)
@@ -311,7 +277,8 @@ class NPUModelRunner(GPUModelRunner):
        self.input_batch = AscendInputBatch(
            req_ids=req_ids,
-            num_reqs=num_reqs_padded,
+            num_reqs=num_reqs,
+            num_reqs_after_padding=num_reqs_padded,
            idx_mapping=idx_mapping,
            idx_mapping_np=idx_mapping_np,
            expanded_idx_mapping=expanded_idx_mapping,
@@ -394,37 +361,34 @@ class NPUModelRunner(GPUModelRunner):
    def _pad_query_start_loc_for_fia(
        self,
        num_tokens_padded: int,
-        num_tokens: int,
+        num_reqs_padded: int,
        num_reqs: int,
        query_start_loc_np: np.ndarray,
-        max_query_len: int,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+        batch_desc_num_reqs: int | None = None,
    ) -> tuple[np.ndarray, int]:
        """
        This function is only designed to satisfied the constraint that when the layout is TND,
        the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
        """
-        assert self.cudagraph_and_dp_padding is not None
-        _num_tokens_after_padding, _num_tokens_across_dp, synced_cudagraph_mode = self.cudagraph_and_dp_padding
-        cudagraph_runtime_mode = CUDAGraphMode(synced_cudagraph_mode)
-        if cudagraph_runtime_mode != CUDAGraphMode.FULL:
-            return query_start_loc_np, num_reqs
-        uniform_decode_query_len = self.cudagraph_manager.uniform_decode_query_len
-        is_uniform_decode = self.cudagraph_manager.is_uniform_decode(
-            num_reqs=num_reqs,
-            num_tokens=num_tokens,
-            max_query_len=max_query_len,
-        )
-        if is_uniform_decode:
+        # TODO: need refactor later, related to vllm PR #34043 this pr delete func
+        # relax_for_mixed_batch_cudagraphs, num_reqs no longer equals the actual number of requests.
+        if cudagraph_runtime_mode == CUDAGraphMode.FULL:
+            num_reqs_padded = num_reqs
+        else:
+            num_reqs_padded = batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs
+        if num_tokens_padded == num_reqs_padded * self.decode_query_len:
            # Uniform-batch case: num_reqs must be no greater than num_reqs_padded
-            num_reqs_padded = num_tokens_padded // uniform_decode_query_len
+            assert num_reqs <= num_reqs_padded
            last_loc = query_start_loc_np[num_reqs]
            query_start_loc_np[num_reqs + 1 : num_reqs_padded + 1] = (
-                np.arange(1, num_reqs_padded + 1 - num_reqs) * uniform_decode_query_len + last_loc
+                np.arange(1, num_reqs_padded + 1 - num_reqs) * self.decode_query_len + last_loc
            )
        else:
            # Mixed-batch case: num_reqs must equal num_reqs_padded
-            num_reqs_padded = min(num_tokens_padded, self.max_num_reqs)
+            assert num_reqs == num_reqs_padded
            # Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly
            query_start_loc_np[num_reqs_padded + 1] = num_tokens_padded
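The uniform-decode branch above can be sanity-checked with small numbers: past the last real request, the padded tail of `query_start_loc` keeps growing in steps of `decode_query_len`. The values below are illustrative, not taken from this diff:

```python
# Illustrative check of the uniform-decode padding of query_start_loc above,
# assuming decode_query_len == 2, 3 real requests of 2 tokens each, and a
# capture size padded to 5 requests (10 tokens).
import numpy as np

decode_query_len, num_reqs, num_reqs_padded = 2, 3, 5
query_start_loc_np = np.zeros(num_reqs_padded + 2, dtype=np.int32)
np.cumsum(np.array([2, 2, 2], dtype=np.int32), out=query_start_loc_np[1 : num_reqs + 1])
last_loc = query_start_loc_np[num_reqs]
# Dummy requests continue from last_loc in steps of decode_query_len.
query_start_loc_np[num_reqs + 1 : num_reqs_padded + 1] = (
    np.arange(1, num_reqs_padded + 1 - num_reqs) * decode_query_len + last_loc
)
assert query_start_loc_np[: num_reqs_padded + 1].tolist() == [0, 2, 4, 6, 8, 10]
```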

View File

@@ -20,6 +20,7 @@
from typing import Any

import torch
+from vllm.config.compilation import CUDAGraphMode
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.worker.gpu.model_states.default import DefaultModelState
from vllm.v1.worker.utils import AttentionGroup
@@ -34,18 +35,28 @@ class AscendModelState(DefaultModelState):
    def prepare_attn(
        self,
        input_batch: AscendInputBatch,
+        cudagraph_mode: CUDAGraphMode,
        block_tables: tuple[torch.Tensor, ...],
        slot_mappings: torch.Tensor,
        attn_groups: list[list[AttentionGroup]],
        kv_cache_config: KVCacheConfig,
+        for_capture: bool = False,
    ) -> dict[str, Any]:
        """Override prepare_attn method because `build_attn_metadata` is different from vllm."""
+        if cudagraph_mode == CUDAGraphMode.FULL:
+            # Use padded sizes - padding is handled by model_runner.prepare_attn.
+            num_reqs = input_batch.num_reqs_after_padding
+            num_tokens = input_batch.num_tokens_after_padding
+        else:
+            # For piecewise cudagraphs and eager, use unpadded sizes.
+            num_reqs = input_batch.num_reqs
+            num_tokens = input_batch.num_tokens
        query_start_loc_cpu = torch.from_numpy(input_batch.query_start_loc_np)
        max_query_len = input_batch.num_scheduled_tokens.max().item()
        attn_metadata = build_attn_metadata(
            attn_groups=attn_groups,
-            num_reqs=input_batch.num_reqs,
-            num_tokens=input_batch.num_tokens,
+            num_reqs=num_reqs,
+            num_tokens=num_tokens,
            query_start_loc_gpu=input_batch.query_start_loc,
            query_start_loc_cpu=query_start_loc_cpu,
            max_query_len=max_query_len,

View File

@@ -1,12 +1,8 @@
from contextlib import contextmanager

import torch
-import vllm
from vllm.logger import logger
-from vllm_ascend.worker.v2.block_table import AscendBlockTables
-from vllm_ascend.worker.v2.model_states import init_asecnd_model_state


@contextmanager
def torch_cuda_wrapper():
@@ -27,27 +23,3 @@ def torch_cuda_wrapper():
        yield
    finally:
        pass
-@contextmanager
-def block_table_wrapper():
-    try:
-        # vllm-ascend need to initialize slot mapping as torch.int32 dtype,
-        # but vllm default is torch.int64 dtype.
-        vllm.v1.worker.gpu.model_runner.BlockTables = AscendBlockTables
-        logger.info_once("Wrapping BlockTables with AscendBlockTables.")
-        yield
-    finally:
-        pass

-@contextmanager
-def model_states_wrapper():
-    try:
-        # prepare_attn in AscendModelState is different from vllm,
-        # we need to override init_model_state.
-        vllm.v1.worker.gpu.model_runner.init_model_state = init_asecnd_model_state
-        logger.info_once("Wrapping init_model_state with init_asecnd_model_state.")
-        yield
-    finally:
-        pass