adapt to main2main for model runner v2 (#7578)

### What this PR does / why we need it?
This PR adapts model runner v2 to the newest commit of the vLLM main branch. Please refer to
https://github.com/vllm-project/vllm-ascend/issues/5208
### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

- vLLM version: v0.18.0
- vLLM main: ed359c497a

---------

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Ronald authored on 2026-03-25 09:08:44 +08:00, committed by GitHub
parent fc3ec100bc
commit d96440924a
16 changed files with 239 additions and 264 deletions

View File

@@ -19,6 +19,7 @@ from vllm.triton_utils import HAS_TRITON
if HAS_TRITON:
import vllm_ascend.patch.worker.patch_triton
import vllm_ascend.patch.worker.patch_v2.patch_triton # noqa
# isort: off
@@ -36,8 +37,8 @@ import vllm_ascend.patch.worker.patch_qwen3_next # noqa
import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
import vllm_ascend.patch.worker.patch_qwen3_5 # noqa
import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
import vllm_ascend.patch.worker.patch_v2_eagle # noqa
import vllm_ascend.patch.worker.patch_v2_uva # noqa
import vllm_ascend.patch.worker.patch_v2.patch_eagle # noqa
import vllm_ascend.patch.worker.patch_v2.patch_uva # noqa
import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa
import vllm_ascend.patch.worker.patch_routed_experts_capturer # noqa
import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa
@@ -45,3 +46,6 @@ import vllm_ascend.patch.worker.patch_kimi_k25 # noqa
import vllm_ascend.patch.worker.patch_draft_quarot # noqa
import vllm_ascend.patch.worker.patch_cudagraph # noqa
import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa
import vllm_ascend.patch.worker.patch_v2.patch_input_batch # noqa
import vllm_ascend.patch.worker.patch_v2.patch_model_state # noqa
import vllm_ascend.patch.worker.patch_v2.patch_block_table # noqa

View File

@@ -5,7 +5,6 @@ from vllm_ascend.ops.triton.fla.chunk import chunk_gated_delta_rule
from vllm_ascend.ops.triton.fla.layernorm_guard import LayerNormFn
from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_recurrent_gated_delta_rule_fwd_kernel
from vllm_ascend.ops.triton.mamba.causal_conv1d import causal_conv1d_fn, causal_conv1d_update_npu
from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample as ascend_gumbel_sample
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu
vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn
@@ -14,4 +13,3 @@ vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_r
)
vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn
vllm.model_executor.layers.fla.ops.chunk_gated_delta_rule = chunk_gated_delta_rule
vllm.v1.worker.gpu.sample.gumbel.gumbel_sample = ascend_gumbel_sample

View File

@@ -0,0 +1,25 @@
# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/block_table.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from vllm.v1.worker.gpu import model_runner
from vllm_ascend.worker.v2.block_table import AscendBlockTables
# vllm-ascend needs to initialize the slot mapping with torch.int32 dtype,
# while vLLM defaults to torch.int64.
model_runner.BlockTables = AscendBlockTables
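
Note: the comment above refers to a dtype mismatch between upstream vLLM and the Ascend backend. As a rough illustration only (the real internals of AscendBlockTables are not shown in this diff, so the names below are assumptions), the override boils down to allocating the slot mapping with a 32-bit integer dtype:

```python
import torch

# Hypothetical sketch, not the actual AscendBlockTables implementation.
class BlockTableSketch:
    def __init__(self, max_num_tokens: int, device: torch.device):
        # Upstream vLLM allocates the slot mapping as torch.int64;
        # the Ascend kernels expect torch.int32, hence the override.
        self.slot_mapping = torch.zeros(max_num_tokens,
                                        dtype=torch.int32,
                                        device=device)
```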

View File

@@ -0,0 +1,27 @@
# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/input_batch.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# Explicitly import the modules so they are loaded before being patched.
from vllm.v1.worker.gpu import cudagraph_utils, model_runner
from vllm_ascend.worker.v2.input_batch import AscendInputBatch
cudagraph_utils.InputBatch = AscendInputBatch
model_runner.InputBatch = AscendInputBatch
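
Note: importing the consumer modules explicitly matters because each module that has already done `from ... import InputBatch` holds its own reference; rebinding the class only where it is defined would not reach those copies. A self-contained sketch of that behaviour (the module names are made up):

```python
import types

defining = types.ModuleType("defining")
defining.InputBatch = object                     # stand-in for vLLM's original class
consumer = types.ModuleType("consumer")
consumer.InputBatch = defining.InputBatch        # simulates `from defining import InputBatch`

class AscendInputBatchSketch:                    # stand-in for the Ascend replacement
    pass

defining.InputBatch = AscendInputBatchSketch
print(consumer.InputBatch is AscendInputBatchSketch)   # False: consumer kept the old reference
consumer.InputBatch = AscendInputBatchSketch            # patching each consumer, as the file above does
print(consumer.InputBatch is AscendInputBatchSketch)   # True
```

This is why the patch assigns AscendInputBatch on both cudagraph_utils and model_runner rather than on the defining module alone.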

View File

@@ -0,0 +1,26 @@
# Adapt from https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu/model_states/default.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from vllm.v1.worker.gpu import model_runner
from vllm_ascend.worker.v2.model_states import init_asecnd_model_state
# prepare_attn in AscendModelState differs from the vLLM implementation,
# so we need to override init_model_state.
model_runner.init_model_state = init_asecnd_model_state
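
Note: only the reason for the override is stated above; the real signature of init_asecnd_model_state is not shown in this diff. As a loose sketch of the pattern (all names and parameters below are illustrative assumptions), the replacement factory constructs a state object whose prepare_attn builds Ascend attention metadata instead of the CUDA variant:

```python
# Illustrative only; not the real vllm_ascend.worker.v2.model_states code.
class ModelStateSketch:
    def prepare_attn(self, scheduler_output):
        ...  # upstream: builds CUDA attention metadata

class AscendModelStateSketch(ModelStateSketch):
    def prepare_attn(self, scheduler_output):
        ...  # Ascend: builds NPU attention metadata instead

def init_ascend_model_state_sketch(*args, **kwargs):
    # Hypothetical factory with made-up parameters; swapping the module-level
    # factory lets the model runner construct the Ascend state transparently.
    return AscendModelStateSketch()
```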

View File

@@ -0,0 +1,12 @@
from vllm.v1.worker.gpu import input_batch
from vllm.v1.worker.gpu.sample import gumbel, logprob, penalties
from vllm_ascend.worker.v2.input_batch import post_update
from vllm_ascend.worker.v2.sample.gumbel import gumbel_sample
from vllm_ascend.worker.v2.sample.logprob import compute_token_logprobs
from vllm_ascend.worker.v2.sample.penalties import apply_penalties
logprob.compute_token_logprobs = compute_token_logprobs
penalties.apply_penalties = apply_penalties
gumbel.gumbel_sample = gumbel_sample
input_batch.post_update = post_update
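
Note: the Ascend implementations being swapped in are not part of this diff. For context on what the gumbel hook does, a generic Gumbel-max sampler is sketched below; the real ascend_gumbel_sample in vllm_ascend.worker.v2.sample.gumbel may differ in seeding, shapes, and temperature handling, so treat this as an assumption rather than the shipped code:

```python
from typing import Optional

import torch

def gumbel_max_sample_sketch(logits: torch.Tensor,
                             generator: Optional[torch.Generator] = None) -> torch.Tensor:
    """Draw one token per row via the Gumbel-max trick.

    Adding Gumbel(0, 1) noise to the logits and taking the argmax is equivalent
    to sampling from the softmax distribution over those logits.
    """
    uniform = torch.rand(logits.shape, device=logits.device, generator=generator)
    gumbel_noise = -torch.log(-torch.log(uniform.clamp_min(1e-20)))
    return torch.argmax(logits + gumbel_noise, dim=-1)
```

The compute_token_logprobs, apply_penalties, and post_update hooks are rebound the same way: the Ascend function object simply replaces the upstream one on the already-imported module.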