[CI] Upgrade vllm to 0.9.1 (#1165)

1. Upgrade vllm to 0.9.1; 0.9.0 is no longer supported on the main branch.
Keep the docs on 0.9.0 until we publish the first 0.9.1 release.
2. disable V0 test for PR
3. move actionlint check to lint job

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-06-11 16:33:11 +08:00
committed by GitHub
parent e46dc142bf
commit 4f5964420e
19 changed files with 72 additions and 320 deletions

View File

@@ -34,8 +34,7 @@ on:
# Current supported vLLM versions # Current supported vLLM versions
options: options:
- main - main
- v0.9.0.1 - v0.9.1
- v0.9.0
- v0.7.3 - v0.7.3
vllm-ascend-version: vllm-ascend-version:
description: 'vllm-ascend version:' description: 'vllm-ascend version:'
@@ -159,7 +158,7 @@ jobs:
repository: vllm-project/vllm repository: vllm-project/vllm
path: ./vllm-empty path: ./vllm-empty
# Please also update this when bump matched version # Please also update this when bump matched version
ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }} ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}
- name: Install vllm-project/vllm from source - name: Install vllm-project/vllm from source
working-directory: ./vllm-empty working-directory: ./vllm-empty

View File

@@ -1,53 +0,0 @@
#
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from vllm-project/vllm/blob/main/.github
#
# Runs actionlint against the repository's workflow files on pull requests.
name: Lint GitHub Actions workflows

on:
  pull_request:
    branches:
      - 'main'
      - '*-dev'
    # Only trigger when a workflow file or the actionlint configuration changes.
    paths:
      - '.github/workflows/*.ya?ml'
      - '.github/workflows/actionlint.*'
      - '.github/workflows/matchers/actionlint.json'

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  actionlint:
    runs-on: ubuntu-latest
    steps:
      - name: "Checkout"
        # Pinned to a full commit SHA; the trailing comment records the tag.
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0

      - name: "Run actionlint"
        env:
          # Passed through to shellcheck; these three checks are excluded.
          SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
        run: |
          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
          tools/actionlint.sh -color

View File

@@ -50,7 +50,7 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- vllm_branch: v0.9.0 - vllm_branch: v0.9.1
vllm_ascend_branch: main vllm_ascend_branch: main
container: container:
image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10

View File

@@ -33,6 +33,9 @@ on:
- '!benchmarks/**' - '!benchmarks/**'
- 'tools/mypy.sh' - 'tools/mypy.sh'
- 'mypy.ini' - 'mypy.ini'
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
- '.github/workflows/matchers/actionlint.json'
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated. # declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -87,6 +90,13 @@ jobs:
repository: vllm-project/vllm repository: vllm-project/vllm
path: vllm-empty path: vllm-empty
- name: Actionlint Check
env:
SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
run: |
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color
- name: Install vllm-project/vllm from source - name: Install vllm-project/vllm from source
working-directory: vllm-empty working-directory: vllm-empty
run: | run: |
@@ -105,7 +115,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4] os: [linux-arm64-npu-1, linux-arm64-npu-4]
vllm_version: [main, v0.9.0] vllm_version: [main, v0.9.1]
concurrency: concurrency:
group: > group: >
${{ ${{
@@ -193,6 +203,7 @@ jobs:
fi fi
- name: Run vllm-project/vllm-ascend test on V0 engine - name: Run vllm-project/vllm-ascend test on V0 engine
if: ${{ github.event_name == 'schedule' }}
env: env:
VLLM_USE_V1: 0 VLLM_USE_V1: 0
run: | run: |

View File

@@ -43,7 +43,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4] os: [linux-arm64-npu-1, linux-arm64-npu-4]
vllm_version: [main, v0.9.0] vllm_version: [main, v0.9.1]
name: vLLM Ascend long term test name: vLLM Ascend long term test
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
container: container:

View File

@@ -41,7 +41,7 @@ jobs:
if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }} if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
strategy: strategy:
matrix: matrix:
vllm_verison: [main, v0.9.0] vllm_verison: [main, v0.9.1]
name: vLLM Ascend prefilling decoding disaggregation test name: vLLM Ascend prefilling decoding disaggregation test
runs-on: linux-arm64-npu-static-8 runs-on: linux-arm64-npu-static-8

View File

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.9.0 ARG VLLM_TAG=v0.9.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

View File

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM # Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.9.0 ARG VLLM_TAG=v0.9.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

View File

@@ -14,8 +14,6 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
set_current_vllm_config) set_current_vllm_config)
from vllm.utils import direct_register_custom_op from vllm.utils import direct_register_custom_op
from vllm_ascend.utils import vllm_version_is
global_counter = 0 global_counter = 0
# create a library to hold the custom op # create a library to hold the custom op
@@ -93,28 +91,14 @@ def test_simple_piecewise_compile():
model = SillyModel(vllm_config=vllm_config, prefix="") model = SillyModel(vllm_config=vllm_config, prefix="")
inputs = torch.randn(100).npu() inputs = torch.randn(100).npu()
if vllm_version_is("0.9.0"):
kwargs = { kwargs = {
"num_graphs_seen": 1, # one graph for the model "num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations": "num_backend_compilations": 3, # num_piecewise_capturable_graphs_seen
3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_caputured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}
else:
kwargs = {
"num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations":
3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_captured": "num_cudagraph_captured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
} }
with compilation_counter.expect(kwargs): with compilation_counter.expect(kwargs):
model(inputs) model(inputs)

View File

@@ -31,7 +31,6 @@ from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.core.scheduler import AscendScheduler from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
@@ -87,25 +86,13 @@ def create_scheduler(
vllm_config = VllmConfig(scheduler_config=scheduler_config, vllm_config = VllmConfig(scheduler_config=scheduler_config,
model_config=model_config, model_config=model_config,
cache_config=cache_config) cache_config=cache_config)
if vllm_version_is("0.9.0"):
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
tensors={},
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32,
False))
],
)
else:
kv_cache_config = KVCacheConfig( kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests num_blocks=10000, # A large number of blocks to hold all requests
kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
kv_cache_groups=[ kv_cache_groups=[
KVCacheGroupSpec(['layer'], KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32, FullAttentionSpec(16, 1, 1, torch.float32, False,
False, None)) None))
], ],
) )
cache_config.num_gpu_blocks = 10000 cache_config.num_gpu_blocks = 10000
@@ -135,18 +122,6 @@ def create_requests(num_requests: int,
else: else:
mm_position = None mm_position = None
mm_inputs = None mm_inputs = None
if vllm_version_is("0.9.0"):
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_inputs=mm_inputs,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
arrival_time=0,
eos_token_id=EOS_TOKEN_ID,
)
else:
request = Request( request = Request(
request_id=f"{i}", request_id=f"{i}",
prompt_token_ids=[i] * num_tokens, prompt_token_ids=[i] * num_tokens,

View File

@@ -31,8 +31,6 @@ from vllm.config import VllmConfig
from vllm.logger import logger from vllm.logger import logger
from vllm.utils import weak_ref_tensors from vllm.utils import weak_ref_tensors
from vllm_ascend.utils import vllm_version_is
@dataclasses.dataclass @dataclasses.dataclass
class ConcreteSizeEntry: class ConcreteSizeEntry:
@@ -206,10 +204,6 @@ class NPUPiecewiseBackend:
# to save memory # to save memory
entry.output = weak_ref_tensors(output) entry.output = weak_ref_tensors(output)
entry.aclgraph = aclgraph entry.aclgraph = aclgraph
if vllm_version_is("0.9.0"):
compilation_counter.num_cudagraph_caputured += 1
else:
compilation_counter.num_cudagraph_captured += 1 compilation_counter.num_cudagraph_captured += 1
# important: we need to return the output, rather than # important: we need to return the output, rather than

View File

@@ -29,8 +29,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
class AscendScheduler(Scheduler): class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler """This Scheduler extends vllm's original v1 scheduler
@@ -129,12 +127,7 @@ class AscendScheduler(Scheduler):
continue continue
assert num_new_tokens > 0 assert num_new_tokens > 0
if vllm_version_is("0.9.0"):
blocks = computed_blocks.blocks
else:
blocks = computed_blocks.blocks[0] blocks = computed_blocks.blocks[0]
watermark = getattr(self.scheduler_config, "watermark", 0.01) watermark = getattr(self.scheduler_config, "watermark", 0.01)
if not self._check_watermark_for_prefill(request, num_new_tokens, if not self._check_watermark_for_prefill(request, num_new_tokens,
blocks, watermark): blocks, watermark):
@@ -330,14 +323,8 @@ class AscendScheduler(Scheduler):
len(computed_blocks) * self.block_size) len(computed_blocks) * self.block_size)
num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, num_required_blocks = cdiv(num_new_tokens + num_computed_tokens,
self.block_size) self.block_size)
if vllm_version_is("0.9.0"):
req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[
request.request_id]
else:
req_blocks = self.kv_cache_manager.coordinator.get_blocks( req_blocks = self.kv_cache_manager.coordinator.get_blocks(
request.request_id) request.request_id)
num_new_blocks = (num_required_blocks - len(req_blocks) - num_new_blocks = (num_required_blocks - len(req_blocks) -
len(computed_blocks)) len(computed_blocks))
num_evictable_computed_blocks = sum(1 for blk in computed_blocks num_evictable_computed_blocks = sum(1 for blk in computed_blocks

View File

@@ -24,9 +24,9 @@
# each worker's `__init__` function. # each worker's `__init__` function.
# #
# Then in each kind of patch, there are three folders: # Then in each kind of patch, there are three folders:
# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0. # - patch_0_9_1: contains the patches applied when vllm version is 0.9.1.
# - patch_main: contains the patches applied when vllm version is main branch. # - patch_main: contains the patches applied when vllm version is main branch.
# - patch_common: contains the patches applied in both 0.9.0 and main branch. # - patch_common: contains the patches applied in both 0.9.1 and main branch.
# #
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well. # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
# ---------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------
@@ -35,17 +35,6 @@
# -------------------------------- # --------------------------------
# * Platform Patch: # * Platform Patch:
# ================= # =================
# ** File: platform/patch_0_9_0/patch_distributed.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()`
# Why:
# vllm distributed uses the gloo backend by default to initialize the stateless process group, but we want to use hccl here
# How
# Add hccl backend to the `stateless_init_torch_distributed_process_group`
# Related PR (if no, explain why):
# https://github.com/vllm-project/vllm/pull/18763
# Future Plan:
# Remove this patch once vllm is upgraded to 0.9.1
# ** File: platform/patch_common/patch_distributed.py** # ** File: platform/patch_common/patch_distributed.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.distributed.parallel_state.destroy_model_parallel()` # 1. `vllm.distributed.parallel_state.destroy_model_parallel()`

View File

@@ -17,8 +17,8 @@
from vllm_ascend.utils import vllm_version_is from vllm_ascend.utils import vllm_version_is
# Import specific patches for different versions # Import specific patches for different versions
if vllm_version_is("0.9.0"): if vllm_version_is("0.9.1"):
from vllm_ascend.patch.platform import patch_0_9_0 # noqa: F401 from vllm_ascend.patch.platform import patch_0_9_1 # noqa: F401
from vllm_ascend.patch.platform import patch_common # noqa: F401 from vllm_ascend.patch.platform import patch_common # noqa: F401
else: else:
from vllm_ascend.patch.platform import patch_common # noqa: F401 from vllm_ascend.patch.platform import patch_common # noqa: F401

View File

@@ -1,116 +0,0 @@
import torch
from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import (Backend, PrefixStore,
_get_default_timeout,
is_nccl_available)
from torch.distributed.rendezvous import rendezvous
from vllm.distributed import utils
def stateless_init_torch_distributed_process_group(
        host: str, port: int, rank: int, world_size: int,
        backend: str) -> ProcessGroup:
    """
    Create a ProcessGroup without polluting torch.distributed's global state.

    This stands in for `torch.distributed.init_process_group`. Because the
    returned ProcessGroup does not depend on the global rank, operations such
    as `allreduce` work on it, while operations such as `broadcast`, which do
    depend on the global rank, cannot be used.

    # TODO: ask for help from PyTorch team if we need the `broadcast` operation.

    Motivation: the total number of processes in a group is not always known
    up front. Say processes 1, 2, ..., 8 want to communicate, and process 9
    may or may not be the same process as process 1, and process 10 may or
    may not be the same process as process 5. We still need a reliable
    channel between processes 9 and 10 that leaves the channel among
    processes 1, 2, ..., 8 untouched.

    Working out beforehand whether 9 and 10 coincide with 1 and 5, and then
    adjusting ranks and world_size accordingly, is not always easy and would
    interfere with the main communication channel.

    Instead, the main channel is always formed among processes 1, 2, ..., 8,
    and this function is used to form an additional, independent channel
    between processes 9 and 10, regardless of whether they coincide with
    processes 1 and 5.

    Args:
        host: Host used to build the `tcp://host:port` rendezvous address.
        port: Port used to build the rendezvous address.
        rank: Rank passed to the rendezvous.
        world_size: World size passed to the rendezvous.
        backend: Backend name; "gloo", "nccl" and "hccl" are handled.

    Returns:
        The ProcessGroup with the selected backend registered on it.

    Raises:
        RuntimeError: If `backend` is not one of the handled backends.
    """
    init_method = f"tcp://{host}:{port}"
    backend = Backend(backend)  # Backend is basically a string
    timeout = _get_default_timeout(backend)

    # The rendezvous yields the store plus the rank / world size to use for
    # this group.
    store, group_rank, group_size = next(
        rendezvous(init_method, rank, world_size, timeout=timeout))
    store.set_timeout(timeout)

    # Use a PrefixStore to avoid accidental overrides of keys used by
    # different systems (e.g. RPC) in case the store is multi-tenant.
    prefix_store = PrefixStore(init_method, store)

    # TODO(Yizhou): Passing options here, while vllm does not, seems to be
    # related to the PyTorch version. The latest version does not need them;
    # older versions, 2.5.1 specifically, do.
    process_group: ProcessGroup = ProcessGroup(
        prefix_store,
        group_rank,
        group_size,
        ProcessGroup.Options(backend=backend),
    )

    if backend == "gloo":
        from torch.distributed.distributed_c10d import ProcessGroupGloo
        backend_impl = ProcessGroupGloo(prefix_store,
                                        group_rank,
                                        group_size,
                                        timeout=timeout)
        backend_type = ProcessGroup.BackendType.GLOO
        device = torch.device("cpu")
    elif backend == "nccl":
        assert is_nccl_available()
        from torch.distributed.distributed_c10d import ProcessGroupNCCL

        nccl_options = ProcessGroupNCCL.Options()
        nccl_options._timeout = timeout

        backend_impl = ProcessGroupNCCL(prefix_store, group_rank, group_size,
                                        nccl_options)
        backend_type = ProcessGroup.BackendType.NCCL
        device = torch.device("cuda")
    elif backend == "hccl":
        from torch.distributed import is_hccl_available
        assert is_hccl_available()
        from torch_npu._C._distributed_c10d import ProcessGroupHCCL

        hccl_options = ProcessGroupHCCL.Options()
        hccl_options._timeout = timeout

        backend_impl = ProcessGroupHCCL(prefix_store, group_rank, group_size,
                                        hccl_options)
        device = torch.device("npu")
        # HCCL is registered as a custom backend type.
        backend_type = ProcessGroup.BackendType.CUSTOM
    else:
        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")

    # TODO(Yizhou): As noted above, _set_default_backend is not implemented
    # in PyTorch 2.5.1, but it needs to be set once the latest version is
    # released.
    # process_group._set_default_backend(backend_type)

    # Shared tail for every supported backend: set the sequence number, then
    # attach the backend implementation to the group for its device.
    backend_impl._set_sequence_number_for_group()
    process_group._register_backend(device, backend_type, backend_impl)

    return process_group
utils.stateless_init_torch_distributed_process_group = stateless_init_torch_distributed_process_group

View File

@@ -18,8 +18,8 @@
from vllm_ascend.utils import vllm_version_is from vllm_ascend.utils import vllm_version_is
# Import specific patches for different versions # Import specific patches for different versions
if vllm_version_is("0.9.0"): if vllm_version_is("0.9.1"):
from vllm_ascend.patch.worker import patch_0_9_0 # noqa: F401 from vllm_ascend.patch.worker import patch_0_9_1 # noqa: F401
from vllm_ascend.patch.worker import patch_common # noqa: F401 from vllm_ascend.patch.worker import patch_common # noqa: F401
else: else:
from vllm_ascend.patch.worker import patch_common # noqa: F401 from vllm_ascend.patch.worker import patch_common # noqa: F401

View File

@@ -14,4 +14,3 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import vllm_ascend.patch.platform.patch_0_9_0.patch_distributed # noqa

View File

@@ -74,7 +74,7 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata
from vllm_ascend.platform import NPUPlatform from vllm_ascend.platform import NPUPlatform
from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is from vllm_ascend.utils import ProfileExecuteDuration
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -1614,18 +1614,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
import torch_npu import torch_npu
kv_caches: Dict[str, torch.Tensor] = {} kv_caches: Dict[str, torch.Tensor] = {}
# Remove this after we drop 0.9.0 support
if vllm_version_is("0.9.0"):
self.input_batch = InputBatch(
max_num_reqs=self.max_num_reqs,
max_model_len=self.model_config.max_model_len,
max_num_batched_tokens=self.max_num_tokens,
device=self.device,
pin_memory=True,
vocab_size=self.model_config.get_vocab_size(),
block_size=self.cache_config.block_size,
)
else:
self.input_batch = InputBatch( self.input_batch = InputBatch(
max_num_reqs=self.max_num_reqs, max_num_reqs=self.max_num_reqs,
max_model_len=self.model_config.max_model_len, max_model_len=self.model_config.max_model_len,
@@ -1636,21 +1624,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
block_sizes=[self.cache_config.block_size], block_sizes=[self.cache_config.block_size],
) )
if not vllm_version_is("0.9.0"):
kv_cache_sizes = {} kv_cache_sizes = {}
for kv_cache_tensor in kv_cache_config.kv_cache_tensors: for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
assert len(kv_cache_tensor.shared_by) == 1, ( assert len(kv_cache_tensor.shared_by) == 1, (
"KV cache tensor shared by multiple layers is not supported in " "KV cache tensor shared by multiple layers is not supported in "
"NPU.") "NPU.")
kv_cache_sizes[ kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
for kv_cache_group in kv_cache_config.kv_cache_groups: for kv_cache_group in kv_cache_config.kv_cache_groups:
kv_cache_spec = kv_cache_group.kv_cache_spec kv_cache_spec = kv_cache_group.kv_cache_spec
for layer_name in kv_cache_group.layer_names: for layer_name in kv_cache_group.layer_names:
if vllm_version_is("0.9.0"):
tensor_size = kv_cache_config.tensors[layer_name].size
else:
tensor_size = kv_cache_sizes[layer_name] tensor_size = kv_cache_sizes[layer_name]
assert tensor_size % kv_cache_spec.page_size_bytes == 0 assert tensor_size % kv_cache_spec.page_size_bytes == 0
num_blocks = tensor_size // kv_cache_spec.page_size_bytes num_blocks = tensor_size // kv_cache_spec.page_size_bytes