diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 6fec0003..1b820631 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -312,7 +312,7 @@ # Future Plan: # Remove this patch when vLLM aligns with the latest processor implementation. # -# ** 10. File: worker/patch_v2_eagle.py** +# ** 10. File: worker/patch_v2/patch_eagle.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose` # Why: @@ -348,7 +348,7 @@ # Future Plan: # Remove this patch when the PTA version used by vllm-ascend has been upgraded. # -# ** 13. File: worker/patch_v2_uva.py** +# ** 13. File: worker/patch_v2/patch_uva.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.worker.gpu.states.UvaBuffer` # Why: @@ -553,3 +553,48 @@ # Future Plan: # The maybe_remap_kv_scale_name function of the community is reconstructed to support # multiple backends. +# ** 24. File: worker/patch_v2/patch_input_batch.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.v1.worker.gpu.input_batch.InputBatch` +# Why: +# vLLM uses InputBatch to make dummy tensors in `model_runner.py` and `cudagraph_utils.py`, +# which makes it difficult to inherit from vLLM's methods. +# How: +# Replace InputBatch with AscendInputBatch. +# Future Plan: +# Remove this patch when vllm-ascend's make_dummy behavior aligns with vLLM. +# ** 25. File: worker/patch_v2/patch_block_table.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.v1.worker.gpu.block_table.BlockTables` +# Why: +# vllm-ascend needs to initialize the slot mapping with torch.int32 dtype, +# but the vLLM default is torch.int64. +# How: +# Replace BlockTables with AscendBlockTables, which initializes the slot mapping +# with torch.int32 dtype. +# Future Plan: +# Remove this patch when vllm-ascend can initialize the slot mapping +# with torch.int64 dtype. +# ** 26. File: worker/patch_v2/patch_model_state.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.v1.worker.gpu.model_states.default.init_model_state` +# Why: +# vllm-ascend's prepare_attn in ModelState is different from vLLM's, +# so we need to override init_model_state. +# How: +# Define AscendModelState and initialize it in init_model_state. +# Future Plan: +# Remove this patch when vllm-ascend's attention metadata is aligned with vLLM's. +# ** 27. File: worker/patch_v2/patch_triton.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.v1.worker.gpu.sample.logprob`, `vllm.v1.worker.gpu.sample.penalties.apply_penalties`, +# `vllm.v1.worker.gpu.sample.gumbel.gumbel_sample` +# Why: +# The Triton ops in vLLM do not perform well on NPU, and there is no dispatch mechanism for Triton ops. +# How: +# Override the Triton ops in vLLM with the Ascend implementations. +# Related PR (if no, explain why): +# No related PR yet; the proper fix is to let vLLM support Triton op dispatch. +# Future Plan: +# Remove this patch when vLLM supports the dispatch mechanism.
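Note: the patch_v2 entries above all rely on the same import-time monkey-patching mechanism: importing the patch module rebinds a name on the consuming vLLM module, so later lookups inside that module resolve to the Ascend replacement. A minimal, self-contained sketch of that pattern follows; the `consumer` module and `make_dummy_batch` helper are stand-ins invented for illustration, not real vLLM APIs.

```python
import types

# Stand-in for a consuming vLLM module such as vllm.v1.worker.gpu.model_runner.
# The real patch files rebind names like InputBatch, BlockTables or init_model_state.
consumer = types.ModuleType("consumer")


class InputBatch:
    """Upstream (GPU) implementation."""
    backend = "gpu"


consumer.InputBatch = InputBatch


def make_dummy_batch():
    # The consumer resolves the class through the module attribute at call time,
    # so a rebind performed later is picked up automatically.
    return consumer.InputBatch()


class AscendInputBatch(InputBatch):
    """NPU replacement, analogous to AscendInputBatch in vllm_ascend.worker.v2.input_batch."""
    backend = "npu"


# This assignment is what a patch_v2 module performs at import time.
consumer.InputBatch = AscendInputBatch

assert make_dummy_batch().backend == "npu"
```

Because the rebinding happens on the module object itself, importing the patch modules once in patch/worker/__init__.py is enough for the replacement to take effect process-wide.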
+# diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index 48c7b4e6..2dcce301 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -19,6 +19,7 @@ from vllm.triton_utils import HAS_TRITON if HAS_TRITON: import vllm_ascend.patch.worker.patch_triton + import vllm_ascend.patch.worker.patch_v2.patch_triton # noqa # isort: off @@ -36,8 +37,8 @@ import vllm_ascend.patch.worker.patch_qwen3_next # noqa import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa import vllm_ascend.patch.worker.patch_qwen3_5 # noqa import vllm_ascend.patch.worker.patch_rejection_sampler # noqa -import vllm_ascend.patch.worker.patch_v2_eagle # noqa -import vllm_ascend.patch.worker.patch_v2_uva # noqa +import vllm_ascend.patch.worker.patch_v2.patch_eagle # noqa +import vllm_ascend.patch.worker.patch_v2.patch_uva # noqa import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa import vllm_ascend.patch.worker.patch_routed_experts_capturer # noqa import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa @@ -45,3 +46,6 @@ import vllm_ascend.patch.worker.patch_kimi_k25 # noqa import vllm_ascend.patch.worker.patch_draft_quarot # noqa import vllm_ascend.patch.worker.patch_cudagraph # noqa import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa +import vllm_ascend.patch.worker.patch_v2.patch_input_batch # noqa +import vllm_ascend.patch.worker.patch_v2.patch_model_state # noqa +import vllm_ascend.patch.worker.patch_v2.patch_block_table # noqa diff --git a/vllm_ascend/patch/worker/patch_triton.py b/vllm_ascend/patch/worker/patch_triton.py index a50b4c6c..3abbcb5c 100644 --- a/vllm_ascend/patch/worker/patch_triton.py +++ b/vllm_ascend/patch/worker/patch_triton.py @@ -5,7 +5,6 @@ from vllm_ascend.ops.triton.fla.chunk import chunk_gated_delta_rule from vllm_ascend.ops.triton.fla.layernorm_guard import LayerNormFn from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_recurrent_gated_delta_rule_fwd_kernel from vllm_ascend.ops.triton.mamba.causal_conv1d import causal_conv1d_fn, causal_conv1d_update_npu -from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample as ascend_gumbel_sample vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn @@ -14,4 +13,3 @@ vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_r ) vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn vllm.model_executor.layers.fla.ops.chunk_gated_delta_rule = chunk_gated_delta_rule -vllm.v1.worker.gpu.sample.gumbel.gumbel_sample = ascend_gumbel_sample diff --git a/vllm_ascend/patch/worker/patch_v2/__init__.py b/vllm_ascend/patch/worker/patch_v2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_ascend/patch/worker/patch_v2/patch_block_table.py b/vllm_ascend/patch/worker/patch_v2/patch_block_table.py new file mode 100644 index 00000000..1c538ab3 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_v2/patch_block_table.py @@ -0,0 +1,25 @@ +# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/block_table.py +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +from vllm.v1.worker.gpu import model_runner + +from vllm_ascend.worker.v2.block_table import AscendBlockTables + +# vllm-ascend needs to initialize the slot mapping with torch.int32 dtype, +# but the vLLM default is torch.int64. +model_runner.BlockTables = AscendBlockTables diff --git a/vllm_ascend/patch/worker/patch_v2_eagle.py b/vllm_ascend/patch/worker/patch_v2/patch_eagle.py similarity index 100% rename from vllm_ascend/patch/worker/patch_v2_eagle.py rename to vllm_ascend/patch/worker/patch_v2/patch_eagle.py diff --git a/vllm_ascend/patch/worker/patch_v2/patch_input_batch.py b/vllm_ascend/patch/worker/patch_v2/patch_input_batch.py new file mode 100644 index 00000000..3c6fac72 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_v2/patch_input_batch.py @@ -0,0 +1,27 @@ +# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/input_batch.py +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + + +# Explicitly import the modules so that they are loaded before patching. +from vllm.v1.worker.gpu import cudagraph_utils, model_runner + +from vllm_ascend.worker.v2.input_batch import AscendInputBatch + +cudagraph_utils.InputBatch = AscendInputBatch +model_runner.InputBatch = AscendInputBatch diff --git a/vllm_ascend/patch/worker/patch_v2/patch_model_state.py b/vllm_ascend/patch/worker/patch_v2/patch_model_state.py new file mode 100644 index 00000000..985981cb --- /dev/null +++ b/vllm_ascend/patch/worker/patch_v2/patch_model_state.py @@ -0,0 +1,26 @@ +# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/model_states/default.py +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project.
+# + +from vllm.v1.worker.gpu import model_runner + +from vllm_ascend.worker.v2.model_states import init_asecnd_model_state + +# prepare_attn in AscendModelState is different from vLLM's, +# so we need to override init_model_state. +model_runner.init_model_state = init_asecnd_model_state diff --git a/vllm_ascend/patch/worker/patch_v2/patch_triton.py b/vllm_ascend/patch/worker/patch_v2/patch_triton.py new file mode 100644 index 00000000..eecde407 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_v2/patch_triton.py @@ -0,0 +1,12 @@ +from vllm.v1.worker.gpu import input_batch +from vllm.v1.worker.gpu.sample import gumbel, logprob, penalties + +from vllm_ascend.worker.v2.input_batch import post_update +from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample +from vllm_ascend.worker.v2.sample.logprob import compute_token_logprobs +from vllm_ascend.worker.v2.sample.penalties import apply_penalties + +logprob.compute_token_logprobs = compute_token_logprobs +penalties.apply_penalties = apply_penalties +gumbel.gumbel_sample = gumbel_sample +input_batch.post_update = post_update diff --git a/vllm_ascend/patch/worker/patch_v2_uva.py b/vllm_ascend/patch/worker/patch_v2/patch_uva.py similarity index 100% rename from vllm_ascend/patch/worker/patch_v2_uva.py rename to vllm_ascend/patch/worker/patch_v2/patch_uva.py diff --git a/vllm_ascend/worker/v2/README.md b/vllm_ascend/worker/v2/README.md index 1c1309e6..2e2fd232 100644 --- a/vllm_ascend/worker/v2/README.md +++ b/vllm_ascend/worker/v2/README.md @@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development. please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208) to get specific plans. -supported vllm version: main@4034c3d32e30d01639459edd3ab486f56993876d -related PR: +supported vllm version: main@ed359c497a728f08b5b41456c07a688ccd510fbc +related PR: diff --git a/vllm_ascend/worker/v2/aclgraph_utils.py b/vllm_ascend/worker/v2/aclgraph_utils.py index a99a1518..889700a6 100644 --- a/vllm_ascend/worker/v2/aclgraph_utils.py +++ b/vllm_ascend/worker/v2/aclgraph_utils.py @@ -16,128 +16,68 @@ # limitations under the License. # This file is a part of the vllm-ascend project.
# -from contextlib import contextmanager from typing import Any -import numpy as np import torch import torch.nn as nn -import vllm from vllm.config import VllmConfig from vllm.config.compilation import CUDAGraphMode from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import logger from vllm.v1.kv_cache_interface import KVCacheConfig -from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer from vllm.v1.worker.gpu.block_table import BlockTables -from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager +from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor, ModelCudaGraphManager from vllm.v1.worker.gpu.input_batch import InputBuffers from vllm.v1.worker.gpu.model_states.interface import ModelState from vllm.v1.worker.utils import AttentionGroup from vllm_ascend.ascend_forward_context import _EXTRA_CTX from vllm_ascend.compilation.acl_graph import set_graph_params, update_full_graph_params -from vllm_ascend.worker.v2.attn_utils import build_attn_metadata -from vllm_ascend.worker.v2.utils import torch_cuda_wrapper -class AclGraphManager(CudaGraphManager): - """ACL Graph Manager for Ascend NPUs.""" +class ModelAclGraphManager(ModelCudaGraphManager): + """ACL Model Cuda Graph Manager for Ascend NPUs.""" def __init__( self, vllm_config: VllmConfig, - use_aux_hidden_state_outputs: bool, device: torch.device, - model_runner: Any, # NPUModelRunner type, in case circular import, so we pass it as Any + cudagraph_mode: CUDAGraphMode, + decode_query_len: int, + model_runner: Any, ): + super().__init__( + vllm_config, + device, + cudagraph_mode, + decode_query_len, + ) # set model runner attribute, so we can access attributes model runner # when call `run_fullgraph` method in CudaGraphManager, # then we don't need to # copy `execute_model` method in `NPUModelRunner` class. self.model_runner = model_runner - super().__init__( - vllm_config, - use_aux_hidden_state_outputs, - device, - ) + # capture_sizes sorts in ascending order. + self.capture_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) # vllm-ascend need to update graph params of attention backend. # so we need to set graph params before capture full graph. if super().needs_capture(): - set_graph_params(self.cudagraph_sizes) + set_graph_params(self.capture_sizes) - def _capture_full_graph( - self, - num_tokens: int, - num_reqs: int, - model: nn.Module, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: torch.Tensor | None, - num_tokens_across_dp: torch.Tensor, - attn_metadata: dict[str, Any] | None, - slot_mappings: dict[str, torch.Tensor] | None, - has_lora: bool = False, - ) -> None: - """Override _capture_full_graph because we need to set capturing=True in forward context.""" - # set capturing=True in before model forward. 
- model = ModelWithContext(model) - return super()._capture_full_graph( - num_tokens, - num_reqs, - model, - input_ids, - positions, - inputs_embeds, - num_tokens_across_dp, - attn_metadata, - slot_mappings, - has_lora, - ) - - def capture_graph( - self, - num_tokens: int, - capture_cg_mode: CUDAGraphMode, - model: nn.Module, - model_state: ModelState, - input_buffers: InputBuffers, - block_tables: BlockTables, - attn_groups: list[list[AttentionGroup]], - kv_cache_config: KVCacheConfig, - has_lora: bool = False, - uniform_decode: bool = False, - ) -> None: - with torch_cuda_wrapper(), prepare_capture_inputs_wrapper(): - super().capture_graph( - num_tokens, - capture_cg_mode, - model, - model_state, - input_buffers, - block_tables, - attn_groups, - kv_cache_config, - has_lora, - uniform_decode, - ) - - def run_fullgraph(self, num_tokens: int) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + def run_fullgraph(self, desc: BatchExecutionDescriptor) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: """Override run_fullgraph to update full graph params in run_fullgraph.""" + num_tokens = desc.num_tokens logger.info_once(f"run_fullgraph with num_tokens={num_tokens}") - ret = super().run_fullgraph(num_tokens) - assert self.model_runner.cudagraph_and_dp_padding is not None + ret = super().run_fullgraph(desc) positions = self.model_runner.input_buffers.positions[:num_tokens] - _num_tokens_after_padding, num_tokens_across_dp, synced_cudagraph_mode = ( - self.model_runner.cudagraph_and_dp_padding - ) - cudagraph_runtime_mode = CUDAGraphMode(synced_cudagraph_mode) - + # refer to vllm.v1.worker.gpu.dp_utils.sync_cudagraph_and_dp_padding to + # calculate num_tokens_across_dp. + num_tokens_across_dp = torch.full([self.model_runner.dp_size], num_tokens, device=self.device) with set_forward_context( self.model_runner.input_batch.attn_metadata, self.vllm_config, num_tokens=num_tokens, - cudagraph_runtime_mode=cudagraph_runtime_mode, + cudagraph_runtime_mode=desc.cg_mode, num_tokens_across_dp=num_tokens_across_dp, batch_descriptor=None, # Full graph model don't need batch_descriptor slot_mapping=self.model_runner.input_batch.slot_mappings, @@ -155,79 +95,31 @@ class AclGraphManager(CudaGraphManager): ) return ret - def is_uniform_decode( + def capture( self, - num_reqs: int, - num_tokens: int, - max_query_len: int, - ): - return (max_query_len == self.uniform_decode_query_len) and (num_tokens == max_query_len * num_reqs) - - -@contextmanager -def prepare_capture_inputs_wrapper(): - """Context manager to override input preparation for NPU graph capture.""" - # TODO(Ronald1995): make prepare_inputs_to_capture as static method - # in CudaGraphManager. 
- ori = vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture - try: - vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture = prepare_inputs_to_capture - yield - finally: - vllm.v1.worker.gpu.cudagraph_utils.prepare_inputs_to_capture = ori - - -def prepare_inputs_to_capture( - num_reqs: int, - num_tokens: int, - input_buffers: InputBuffers, - block_tables: BlockTables, - attn_groups: list[list[AttentionGroup]], - max_model_len: int, - kv_cache_config: KVCacheConfig, - uniform_decode_query_len: int = 0, -) -> tuple[dict[str, Any], dict[str, torch.Tensor]]: - if uniform_decode_query_len > 0: - num_tokens_per_req = uniform_decode_query_len - else: - num_tokens_per_req = num_tokens // num_reqs - - query_start_loc_np = np.arange(num_reqs + 1, dtype=np.int32) * num_tokens_per_req - query_start_loc_np[-1] = num_tokens - query_start_loc_cpu = torch.from_numpy(query_start_loc_np) - input_buffers.query_start_loc[: num_reqs + 1] = query_start_loc_cpu - input_buffers.query_start_loc[num_reqs + 1 :] = num_tokens - query_start_loc = input_buffers.query_start_loc[: num_reqs + 1] - - # HACK(woosuk): For faster warmup, we set seq_lens (GPU) to num_tokens - # rather than max_model_len. - input_buffers.seq_lens[:num_reqs] = num_tokens - input_buffers.seq_lens[num_reqs:] = 0 - input_buffers.seq_lens_cpu[:num_reqs] = num_tokens - input_buffers.seq_lens_cpu[num_reqs:] = 0 - - input_buffers.dcp_local_seq_lens[:num_reqs] = num_tokens - input_buffers.dcp_local_seq_lens[num_reqs:] = 0 - - input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables] - slot_mappings = block_tables.slot_mappings[:, :num_tokens] - slot_mappings_by_layer = build_slot_mappings_by_layer(slot_mappings, kv_cache_config) - - attn_metadata = build_attn_metadata( - attn_groups=attn_groups, - num_reqs=num_reqs, - num_tokens=num_tokens, - query_start_loc_gpu=query_start_loc, - query_start_loc_cpu=query_start_loc_cpu, - max_query_len=num_tokens_per_req, - seq_lens=input_buffers.seq_lens, - max_seq_len=max_model_len, - block_tables=input_block_tables, - slot_mappings=slot_mappings, - kv_cache_config=kv_cache_config, - seq_lens_np=input_buffers.seq_lens_np, - ) - return attn_metadata, slot_mappings_by_layer + model: nn.Module, + model_state: ModelState, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_groups: list[list[AttentionGroup]], + kv_cache_config: KVCacheConfig, + has_lora: bool = False, + use_aux_hidden_state_outputs: bool = False, + progress_bar_desc: str = "Capturing CUDA graphs", + ) -> None: + """Capture CUDA graphs for model forward pass.""" + model = ModelWithContext(model) + return super().capture( + model, + model_state, + input_buffers, + block_tables, + attn_groups, + kv_cache_config, + has_lora, + use_aux_hidden_state_outputs, + progress_bar_desc, + ) class ModelWithContext(nn.Module): @@ -242,6 +134,7 @@ class ModelWithContext(nn.Module): def forward(self, *args, **kwargs): # In warmup phase, capturing=False by default. # when capturing, we need to set capturing=True in forward context. 
- _EXTRA_CTX.capturing = True + if torch.npu.is_current_stream_capturing(): + _EXTRA_CTX.capturing = True return self.original_model(*args, **kwargs) diff --git a/vllm_ascend/worker/v2/input_batch.py b/vllm_ascend/worker/v2/input_batch.py index 1c8e78d2..24d9f375 100644 --- a/vllm_ascend/worker/v2/input_batch.py +++ b/vllm_ascend/worker/v2/input_batch.py @@ -79,14 +79,12 @@ class AscendInputBatch(InputBatch): num_reqs: int, num_tokens: int, input_buffers: AscendInputBuffers, - device: torch.device, ) -> "AscendInputBatch": """Override the make_dummy method to calculate seq_lens_np.""" input_batch = InputBatch.make_dummy( num_reqs, num_tokens, input_buffers, - device, ) # seq_len equals to query_len input_buffers.seq_lens_np[:num_reqs] = num_tokens // num_reqs diff --git a/vllm_ascend/worker/v2/model_runner.py b/vllm_ascend/worker/v2/model_runner.py index 76fd613a..de519afc 100644 --- a/vllm_ascend/worker/v2/model_runner.py +++ b/vllm_ascend/worker/v2/model_runner.py @@ -17,17 +17,13 @@ # This file is a part of the vllm-ascend project. # -import functools - import numpy as np import torch -import vllm from vllm.config import VllmConfig from vllm.config.compilation import CUDAGraphMode -from vllm.sequence import IntermediateTensors from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu.buffer_utils import async_copy_to_gpu +from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor from vllm.v1.worker.gpu.input_batch import ( combine_sampled_and_draft_tokens, expand_idx_mapping, @@ -38,21 +34,21 @@ from vllm.v1.worker.gpu.model_runner import GPUModelRunner from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.utils import set_weight_prefetch_method -from vllm_ascend.worker.v2.aclgraph_utils import AclGraphManager +from vllm_ascend.worker.v2.aclgraph_utils import ModelAclGraphManager from vllm_ascend.worker.v2.attn_utils import build_attn_state from vllm_ascend.worker.v2.input_batch import AscendInputBatch, AscendInputBuffers from vllm_ascend.worker.v2.sample.sampler import AscendSampler from vllm_ascend.worker.v2.spec_decode import init_speculator from vllm_ascend.worker.v2.spec_decode.eagle import AscendEagleSpeculator from vllm_ascend.worker.v2.states import AscendRequestState -from vllm_ascend.worker.v2.utils import block_table_wrapper, model_states_wrapper, torch_cuda_wrapper +from vllm_ascend.worker.v2.utils import torch_cuda_wrapper class NPUModelRunner(GPUModelRunner): """Model runner for Ascend NPUs.""" def __init__(self, vllm_config: VllmConfig, device: torch.device): - with torch_cuda_wrapper(), block_table_wrapper(), model_states_wrapper(): + with torch_cuda_wrapper(): super().__init__(vllm_config, device) # because we will override these attribute, delete these attribute to @@ -64,11 +60,12 @@ class NPUModelRunner(GPUModelRunner): del self.speculator # NPU specific initializations can be added below. - self.cudagraph_manager: AclGraphManager = AclGraphManager( + self.cudagraph_manager: ModelAclGraphManager = ModelAclGraphManager( self.vllm_config, - self.use_aux_hidden_state_outputs, self.device, - self, + self.compilation_config.cudagraph_mode, + decode_query_len=self.decode_query_len, + model_runner=self, ) # we define AscendEagleSpeculator in vllm_ascend.worker.v2.spec_decode.eagle @@ -138,50 +135,17 @@ class NPUModelRunner(GPUModelRunner): # so we can inherit `execute_model` method. 
self.input_batch: AscendInputBatch | None = None - @torch.inference_mode() - def execute_model( - self, - scheduler_output: SchedulerOutput, - intermediate_tensors: IntermediateTensors | None = None, - dummy_run: bool = False, - skip_attn_for_dummy_run: bool = False, - ) -> ModelRunnerOutput | IntermediateTensors | None: - """Override GPUModelRunner.execute_model for Ascend NPUs by there reasons: - 1. when run fullgraph, we need to use ret value of `get_cudagraph_and_dp_padding` - to set forward_context in `run_fullgraph`. - """ - - # use closure to store return value of get_cudagraph_and_dp_padding in model runner. - def wrapper(func): - @functools.wraps(func) - def inner(*args, **kwargs): - self.cudagraph_and_dp_padding = func(*args, **kwargs) - return self.cudagraph_and_dp_padding - - return inner - - if self.cudagraph_and_dp_padding is None: - vllm.v1.worker.gpu.model_runner.get_cudagraph_and_dp_padding = wrapper( - vllm.v1.worker.gpu.model_runner.get_cudagraph_and_dp_padding - ) - - return super().execute_model( - scheduler_output, - intermediate_tensors, - dummy_run, - skip_attn_for_dummy_run, - ) - def prepare_inputs( self, scheduler_output: SchedulerOutput, - num_tokens_after_padding: int, + batch_desc: BatchExecutionDescriptor, ) -> AscendInputBatch: """Override GPUModelRunner.prepare_inputs for Ascend NPUs. npu attention backends need seq_lens_cpu to work. so we need to prepare seq_lens_cpu here. """ num_tokens = scheduler_output.total_num_scheduled_tokens + num_tokens_after_padding = batch_desc.num_tokens assert num_tokens > 0 num_tokens_per_req = scheduler_output.num_scheduled_tokens num_reqs = len(num_tokens_per_req) @@ -247,6 +211,7 @@ class NPUModelRunner(GPUModelRunner): # Get query_start_loc. # NOTE: For FULL mode we change +1 to +2 to reserve extra space for padding. # See _pad_query_start_loc_for_fia. + num_reqs_padded = batch_desc.num_reqs or num_reqs query_start_loc_np = np.empty(self.max_num_reqs + 2, dtype=np.int32) query_start_loc_np[0] = 0 np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1 : num_reqs + 1]) @@ -256,11 +221,12 @@ class NPUModelRunner(GPUModelRunner): # This is only required for vllm-ascend. query_start_loc_np, num_reqs_padded = self._pad_query_start_loc_for_fia( - num_tokens_padded=num_tokens_after_padding, - num_tokens=num_tokens, - num_reqs=num_reqs, - query_start_loc_np=query_start_loc_np, - max_query_len=max(scheduler_output.num_scheduled_tokens.values()), + num_tokens_after_padding, + num_reqs_padded, + num_reqs, + query_start_loc_np, + batch_desc.cg_mode, + batch_desc.num_reqs, ) async_copy_to_gpu(query_start_loc_np, out=self.input_buffers.query_start_loc) @@ -311,7 +277,8 @@ class NPUModelRunner(GPUModelRunner): self.input_batch = AscendInputBatch( req_ids=req_ids, - num_reqs=num_reqs_padded, + num_reqs=num_reqs, + num_reqs_after_padding=num_reqs_padded, idx_mapping=idx_mapping, idx_mapping_np=idx_mapping_np, expanded_idx_mapping=expanded_idx_mapping, @@ -394,37 +361,34 @@ class NPUModelRunner(GPUModelRunner): def _pad_query_start_loc_for_fia( self, num_tokens_padded: int, - num_tokens: int, + num_reqs_padded: int, num_reqs: int, query_start_loc_np: np.ndarray, - max_query_len: int, + cudagraph_runtime_mode: CUDAGraphMode | None = None, + batch_desc_num_reqs: int | None = None, ) -> tuple[np.ndarray, int]: """ This function is only designed to satisfied the constraint that when the layout is TND, the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`. 
""" - assert self.cudagraph_and_dp_padding is not None - _num_tokens_after_padding, _num_tokens_across_dp, synced_cudagraph_mode = self.cudagraph_and_dp_padding - cudagraph_runtime_mode = CUDAGraphMode(synced_cudagraph_mode) - if cudagraph_runtime_mode != CUDAGraphMode.FULL: - return query_start_loc_np, num_reqs - uniform_decode_query_len = self.cudagraph_manager.uniform_decode_query_len - is_uniform_decode = self.cudagraph_manager.is_uniform_decode( - num_reqs=num_reqs, - num_tokens=num_tokens, - max_query_len=max_query_len, - ) - if is_uniform_decode: + # TODO: need refactor later, related to vllm PR #34043 this pr delete func + # relax_for_mixed_batch_cudagraphs, num_reqs no longer equals the actual number of requests. + if cudagraph_runtime_mode == CUDAGraphMode.FULL: + num_reqs_padded = num_reqs + else: + num_reqs_padded = batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs + + if num_tokens_padded == num_reqs_padded * self.decode_query_len: # Uniform-batch case: num_reqs must be no greater than num_reqs_padded - num_reqs_padded = num_tokens_padded // uniform_decode_query_len + assert num_reqs <= num_reqs_padded last_loc = query_start_loc_np[num_reqs] query_start_loc_np[num_reqs + 1 : num_reqs_padded + 1] = ( - np.arange(1, num_reqs_padded + 1 - num_reqs) * uniform_decode_query_len + last_loc + np.arange(1, num_reqs_padded + 1 - num_reqs) * self.decode_query_len + last_loc ) else: # Mixed-batch case: num_reqs must equal num_reqs_padded - num_reqs_padded = min(num_tokens_padded, self.max_num_reqs) + assert num_reqs == num_reqs_padded # Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly query_start_loc_np[num_reqs_padded + 1] = num_tokens_padded diff --git a/vllm_ascend/worker/v2/model_states/default.py b/vllm_ascend/worker/v2/model_states/default.py index bde4d7dc..cece7d18 100644 --- a/vllm_ascend/worker/v2/model_states/default.py +++ b/vllm_ascend/worker/v2/model_states/default.py @@ -20,6 +20,7 @@ from typing import Any import torch +from vllm.config.compilation import CUDAGraphMode from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.worker.gpu.model_states.default import DefaultModelState from vllm.v1.worker.utils import AttentionGroup @@ -34,18 +35,28 @@ class AscendModelState(DefaultModelState): def prepare_attn( self, input_batch: AscendInputBatch, + cudagraph_mode: CUDAGraphMode, block_tables: tuple[torch.Tensor, ...], slot_mappings: torch.Tensor, attn_groups: list[list[AttentionGroup]], kv_cache_config: KVCacheConfig, + for_capture: bool = False, ) -> dict[str, Any]: """Override prepare_attn method because `build_attn_metadata` is different from vllm.""" + if cudagraph_mode == CUDAGraphMode.FULL: + # Use padded sizes - padding is handled by model_runner.prepare_attn. + num_reqs = input_batch.num_reqs_after_padding + num_tokens = input_batch.num_tokens_after_padding + else: + # For piecewise cudagraphs and eager, use unpadded sizes. 
+ num_reqs = input_batch.num_reqs + num_tokens = input_batch.num_tokens query_start_loc_cpu = torch.from_numpy(input_batch.query_start_loc_np) max_query_len = input_batch.num_scheduled_tokens.max().item() attn_metadata = build_attn_metadata( attn_groups=attn_groups, - num_reqs=input_batch.num_reqs, - num_tokens=input_batch.num_tokens, + num_reqs=num_reqs, + num_tokens=num_tokens, query_start_loc_gpu=input_batch.query_start_loc, query_start_loc_cpu=query_start_loc_cpu, max_query_len=max_query_len, diff --git a/vllm_ascend/worker/v2/utils.py b/vllm_ascend/worker/v2/utils.py index 349002b8..b17f3bb3 100644 --- a/vllm_ascend/worker/v2/utils.py +++ b/vllm_ascend/worker/v2/utils.py @@ -1,12 +1,8 @@ from contextlib import contextmanager import torch -import vllm from vllm.logger import logger -from vllm_ascend.worker.v2.block_table import AscendBlockTables -from vllm_ascend.worker.v2.model_states import init_asecnd_model_state - @contextmanager def torch_cuda_wrapper(): @@ -27,27 +23,3 @@ def torch_cuda_wrapper(): yield finally: pass - - -@contextmanager -def block_table_wrapper(): - try: - # vllm-ascend need to initialize slot mapping as torch.int32 dtype, - # but vllm default is torch.int64 dtype. - vllm.v1.worker.gpu.model_runner.BlockTables = AscendBlockTables - logger.info_once("Wrapping BlockTables with AscendBlockTables.") - yield - finally: - pass - - -@contextmanager -def model_states_wrapper(): - try: - # prepare_attn in AscendModelState is different from vllm, - # we need to override init_model_state. - vllm.v1.worker.gpu.model_runner.init_model_state = init_asecnd_model_state - logger.info_once("Wrapping init_model_state with init_asecnd_model_state.") - yield - finally: - pass
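As a closing note on `_pad_query_start_loc_for_fia` above: in the uniform-decode branch, dummy requests are appended so that the last element of `query_start_loc` lands exactly on the padded token count, which satisfies the TND-layout constraint from the docstring (the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`). A small self-contained sketch with made-up sizes (3 real decode requests, a `decode_query_len` of 1, and an 8-token captured graph size), mirroring that branch:

```python
import numpy as np

# Hypothetical sizes, for illustration only.
num_reqs = 3            # real requests in the batch
decode_query_len = 1    # tokens per request in uniform decode
num_tokens_padded = 8   # padded to the captured ACL graph size
num_reqs_padded = num_tokens_padded // decode_query_len

# query_start_loc for the real requests: [0, 1, 2, 3]
query_start_loc = np.zeros(num_reqs_padded + 1, dtype=np.int32)
query_start_loc[1 : num_reqs + 1] = np.arange(1, num_reqs + 1) * decode_query_len

# Extend with dummy requests so the last element equals num_tokens_padded,
# mirroring the uniform-decode branch of _pad_query_start_loc_for_fia.
last_loc = query_start_loc[num_reqs]
query_start_loc[num_reqs + 1 : num_reqs_padded + 1] = (
    np.arange(1, num_reqs_padded + 1 - num_reqs) * decode_query_len + last_loc
)

print(query_start_loc)  # [0 1 2 3 4 5 6 7 8]
assert query_start_loc[-1] == num_tokens_padded
```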