From 4f5964420e271b2ed7e6cd89cbad2bb66b169114 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Wed, 11 Jun 2025 16:33:11 +0800
Subject: [PATCH] [CI] Upgrade vllm to 0.9.1 (#1165)

1. Upgrade vllm to 0.9.1; 0.9.0 is no longer supported on the main branch.
   Keep the docs on 0.9.0 until the first 0.9.1-based release is published.
2. Disable the V0 engine test for PRs; it now runs only on the scheduled job.
3. Move the actionlint check into the lint job.

Signed-off-by: wangxiyuan
---
 .github/workflows/accuracy_test.yaml           |   5 +-
 .github/workflows/actionlint.yml               |  53 --------
 .github/workflows/nightly_benchmarks.yaml      |   2 +-
 .github/workflows/vllm_ascend_test.yaml        |  13 +-
 .../workflows/vllm_ascend_test_long_term.yaml  |   2 +-
 .github/workflows/vllm_ascend_test_pd.yaml     |   2 +-
 Dockerfile                                     |   2 +-
 Dockerfile.openEuler                           |   2 +-
 tests/singlecard/compile/test_simple.py        |  32 ++---
 tests/singlecard/test_scheduler.py             |  61 +++------
 vllm_ascend/compilation/piecewise_backend.py   |   8 +-
 vllm_ascend/core/scheduler.py                  |  19 +--
 vllm_ascend/patch/__init__.py                  |  15 +--
 vllm_ascend/patch/platform/__init__.py         |   4 +-
 .../platform/patch_0_9_0/patch_distributed.py  | 116 ------------------
 .../patch_0_9_1}/__init__.py                   |   0
 vllm_ascend/patch/worker/__init__.py           |   4 +-
 .../patch_0_9_1}/__init__.py                   |   1 -
 vllm_ascend/worker/model_runner_v1.py          |  51 +++-----
 19 files changed, 72 insertions(+), 320 deletions(-)
 delete mode 100644 .github/workflows/actionlint.yml
 delete mode 100644 vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py
 rename vllm_ascend/patch/{worker/patch_0_9_0 => platform/patch_0_9_1}/__init__.py (100%)
 rename vllm_ascend/patch/{platform/patch_0_9_0 => worker/patch_0_9_1}/__init__.py (90%)

diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml
index d1a6123..999fb6a 100644
--- a/.github/workflows/accuracy_test.yaml
+++ b/.github/workflows/accuracy_test.yaml
@@ -34,8 +34,7 @@ on:
         # Current supported vLLM versions
         options:
           - main
-          - v0.9.0.1
-          - v0.9.0
+          - v0.9.1
           - v0.7.3
       vllm-ascend-version:
         description: 'vllm-ascend version:'
@@ -159,7 +158,7 @@ jobs:
           repository: vllm-project/vllm
           path: ./vllm-empty
           # Please also update this when bump matched version
-          ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }}
+          ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}
 
       - name: Install vllm-project/vllm from source
         working-directory: ./vllm-empty
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
deleted file mode 100644
index 91cd9c4..0000000
--- a/.github/workflows/actionlint.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-#
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Adapted from vllm-project/vllm/blob/main/.github
-#
-
-name: Lint GitHub Actions workflows
-on:
-  pull_request:
-    branches:
-      - 'main'
-      - '*-dev'
-    paths:
-      - '.github/workflows/*.ya?ml'
-      - '.github/workflows/actionlint.*'
-      - '.github/workflows/matchers/actionlint.json'
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-jobs:
-  actionlint:
-    runs-on: ubuntu-latest
-    steps:
-      - name: "Checkout"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: "Run actionlint"
-        env:
-          SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
-        run: |
-          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-          tools/actionlint.sh -color
diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml
index da4dbcc..6ee1b45 100644
--- a/.github/workflows/nightly_benchmarks.yaml
+++ b/.github/workflows/nightly_benchmarks.yaml
@@ -50,7 +50,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.9.0
+          - vllm_branch: v0.9.1
             vllm_ascend_branch: main
     container:
       image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index b023502..073058d 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -33,6 +33,9 @@ on:
       - '!benchmarks/**'
       - 'tools/mypy.sh'
       - 'mypy.ini'
+      - '.github/workflows/*.ya?ml'
+      - '.github/workflows/actionlint.*'
+      - '.github/workflows/matchers/actionlint.json'
 
 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -87,6 +90,13 @@ jobs:
           repository: vllm-project/vllm
           path: vllm-empty
 
+      - name: Actionlint Check
+        env:
+          SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
+        run: |
+          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+          tools/actionlint.sh -color
+
       - name: Install vllm-project/vllm from source
         working-directory: vllm-empty
         run: |
@@ -105,7 +115,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.9.0]
+        vllm_version: [main, v0.9.1]
     concurrency:
       group: >
         ${{
@@ -193,6 +203,7 @@ jobs:
         fi
 
       - name: Run vllm-project/vllm-ascend test on V0 engine
+        if: ${{ github.event_name == 'schedule' }}
         env:
           VLLM_USE_V1: 0
         run: |
diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml
index 2cc8917..c17200a 100644
--- a/.github/workflows/vllm_ascend_test_long_term.yaml
+++ b/.github/workflows/vllm_ascend_test_long_term.yaml
@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.9.0]
+        vllm_version: [main, v0.9.1]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}
     container:
diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml
index 7548b07..c2c76c9 100644
--- a/.github/workflows/vllm_ascend_test_pd.yaml
+++ b/.github/workflows/vllm_ascend_test_pd.yaml
@@ -41,7 +41,7 @@ jobs:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') &&
         contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
     strategy:
       matrix:
-        vllm_verison: [main, v0.9.0]
+        vllm_verison: [main, v0.9.1]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8
diff --git a/Dockerfile b/Dockerfile
index 1dfd10c..952e77f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.0
+ARG VLLM_TAG=v0.9.1
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index ffd1174..2ff3d0b 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.0
+ARG VLLM_TAG=v0.9.1
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 
diff --git a/tests/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py
index 64d4cba..70b8929 100644
--- a/tests/singlecard/compile/test_simple.py
+++ b/tests/singlecard/compile/test_simple.py
@@ -14,8 +14,6 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                          set_current_vllm_config)
 from vllm.utils import direct_register_custom_op
 
-from vllm_ascend.utils import vllm_version_is
-
 global_counter = 0
 
 # create a library to hold the custom op
@@ -93,28 +91,14 @@ def test_simple_piecewise_compile():
     model = SillyModel(vllm_config=vllm_config, prefix="")
 
     inputs = torch.randn(100).npu()
-
-    if vllm_version_is("0.9.0"):
-        kwargs = {
-            "num_graphs_seen": 1,  # one graph for the model
-            "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
-            "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
-            "num_backend_compilations":
-            3,  # num_piecewise_capturable_graphs_seen
-            "num_cudagraph_caputured":
-            6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-        }
-    else:
-        kwargs = {
-            "num_graphs_seen": 1,  # one graph for the model
-            "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
-            "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
-            "num_backend_compilations":
-            3,  # num_piecewise_capturable_graphs_seen
-            "num_cudagraph_captured":
-            6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-        }
-
+    kwargs = {
+        "num_graphs_seen": 1,  # one graph for the model
+        "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
+        "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
+        "num_backend_compilations": 3,  # num_piecewise_capturable_graphs_seen
+        "num_cudagraph_captured":
+        6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    }
 
     with compilation_counter.expect(kwargs):
         model(inputs)
diff --git a/tests/singlecard/test_scheduler.py b/tests/singlecard/test_scheduler.py
index d1c6062..8021f03 100644
--- a/tests/singlecard/test_scheduler.py
+++ b/tests/singlecard/test_scheduler.py
@@ -31,7 +31,6 @@ from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
 from vllm_ascend.core.scheduler import AscendScheduler
-from vllm_ascend.utils import vllm_version_is
 
 EOS_TOKEN_ID = 50256
 
@@ -87,27 +86,15 @@ def create_scheduler(
     vllm_config = VllmConfig(scheduler_config=scheduler_config,
                              model_config=model_config,
                              cache_config=cache_config)
-
-    if vllm_version_is("0.9.0"):
-        kv_cache_config = KVCacheConfig(
-            num_blocks=10000,  # A large number of blocks to hold all requests
-            tensors={},
-            kv_cache_groups=[
-                KVCacheGroupSpec(['layer'],
-                                 FullAttentionSpec(16, 1, 1, torch.float32,
-                                                   False))
-            ],
-        )
-    else:
-        kv_cache_config = KVCacheConfig(
-            num_blocks=10000,  # A large number of blocks to hold all requests
-            kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
-            kv_cache_groups=[
-                KVCacheGroupSpec(['layer'],
-                                 FullAttentionSpec(16, 1, 1, torch.float32,
-                                                   False, None))
-            ],
-        )
+    kv_cache_config = KVCacheConfig(
+        num_blocks=10000,  # A large number of blocks to hold all requests
+        kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
+        kv_cache_groups=[
+            KVCacheGroupSpec(['layer'],
+                             FullAttentionSpec(16, 1, 1, torch.float32, False,
+                                               None))
+        ],
+    )
     cache_config.num_gpu_blocks = 10000
     return AscendScheduler(
         vllm_config,
@@ -135,27 +122,15 @@ def create_requests(num_requests: int,
         else:
             mm_position = None
             mm_inputs = None
-        if vllm_version_is("0.9.0"):
-            request = Request(
-                request_id=f"{i}",
-                prompt_token_ids=[i] * num_tokens,
-                sampling_params=sampling_params,
-                multi_modal_inputs=mm_inputs,
-                multi_modal_placeholders=mm_position,
-                multi_modal_hashes=None,
-                arrival_time=0,
-                eos_token_id=EOS_TOKEN_ID,
-            )
-        else:
-            request = Request(
-                request_id=f"{i}",
-                prompt_token_ids=[i] * num_tokens,
-                sampling_params=sampling_params,
-                multi_modal_inputs=mm_inputs,
-                multi_modal_placeholders=mm_position,
-                multi_modal_hashes=None,
-                eos_token_id=EOS_TOKEN_ID,
-            )
+        request = Request(
+            request_id=f"{i}",
+            prompt_token_ids=[i] * num_tokens,
+            sampling_params=sampling_params,
+            multi_modal_inputs=mm_inputs,
+            multi_modal_placeholders=mm_position,
+            multi_modal_hashes=None,
+            eos_token_id=EOS_TOKEN_ID,
+        )
         requests.append(request)
 
     return requests
diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py
index 95ce693..c6a800b 100644
--- a/vllm_ascend/compilation/piecewise_backend.py
+++ b/vllm_ascend/compilation/piecewise_backend.py
@@ -31,8 +31,6 @@ from vllm.config import VllmConfig
 from vllm.logger import logger
 from vllm.utils import weak_ref_tensors
 
-from vllm_ascend.utils import vllm_version_is
-
 
 @dataclasses.dataclass
 class ConcreteSizeEntry:
@@ -206,11 +204,7 @@ class NPUPiecewiseBackend:
             # to save memory
             entry.output = weak_ref_tensors(output)
             entry.aclgraph = aclgraph
-
-            if vllm_version_is("0.9.0"):
-                compilation_counter.num_cudagraph_caputured += 1
-            else:
-                compilation_counter.num_cudagraph_captured += 1
+            compilation_counter.num_cudagraph_captured += 1
 
             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
index 42f5d9c..05c663f 100644
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -29,8 +29,6 @@ from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class AscendScheduler(Scheduler):
     """This Scheduler extends vllm's original v1 scheduler
@@ -129,12 +127,7 @@ class AscendScheduler(Scheduler):
                 continue
 
             assert num_new_tokens > 0
-
-            if vllm_version_is("0.9.0"):
-                blocks = computed_blocks.blocks
-            else:
-                blocks = computed_blocks.blocks[0]
-
+            blocks = computed_blocks.blocks[0]
             watermark = getattr(self.scheduler_config, "watermark", 0.01)
             if not self._check_watermark_for_prefill(request, num_new_tokens,
                                                      blocks, watermark):
@@ -330,14 +323,8 @@ class AscendScheduler(Scheduler):
                               len(computed_blocks) * self.block_size)
         num_required_blocks = cdiv(num_new_tokens + num_computed_tokens,
                                    self.block_size)
-
-        if vllm_version_is("0.9.0"):
-            req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[
-                request.request_id]
-        else:
-            req_blocks = self.kv_cache_manager.coordinator.get_blocks(
-                request.request_id)
-
+        req_blocks = self.kv_cache_manager.coordinator.get_blocks(
+            request.request_id)
         num_new_blocks = (num_required_blocks - len(req_blocks) -
                           len(computed_blocks))
         num_evictable_computed_blocks = sum(1 for blk in computed_blocks
diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 4be92c2..3c24bfc 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -24,9 +24,9 @@
 #    each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
+# - patch_0_9_1: contains the patches applied when vllm version is 0.9.1.
 # - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.9.0 and main branch.
+# - patch_common: contains the patches applied in both 0.9.1 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------
@@ -35,17 +35,6 @@
 # --------------------------------
 # * Platform Patch:
 # =================
-# ** File: platform/patch_0_9_0/patch_distributed.py**
-#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#    1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()`
-#       Why:
-#           vllm distributed use gloo backend by default to initialize stateless process group, but we want to use hccl here
-#       How:
-#           Add hccl backend to the `stateless_init_torch_distributed_process_group`
-#       Related PR (if no, explain why):
-#           https://github.com/vllm-project/vllm/pull/18763
-#       Future Plan:
-#           Remove this patch once vllm is upgraded to 0.9.1
 # ** File: platform/patch_common/patch_distributed.py**
 #    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #    1. `vllm.distributed.parallel_state.destroy_model_parallel()`
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index e724fe5..4ec38e3 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.9.0"):
-    from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
+if vllm_version_is("0.9.1"):
+    from vllm_ascend.patch.platform import patch_0_9_1  # noqa: F401
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
diff --git a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py b/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py
deleted file mode 100644
index d468326..0000000
--- a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import torch
-from torch.distributed import ProcessGroup
-from torch.distributed.distributed_c10d import (Backend, PrefixStore,
-                                                _get_default_timeout,
-                                                is_nccl_available)
-from torch.distributed.rendezvous import rendezvous
-from vllm.distributed import utils
-
-
-def stateless_init_torch_distributed_process_group(
-        host: str, port: int, rank: int, world_size: int,
-        backend: str) -> ProcessGroup:
-    """
-    A replacement for `torch.distributed.init_process_group` that does not
-    pollute the global state. The created ProcessGroup object can be used for
-    some operations such as `allreduce`, because it does not depend on the
-    global rank. However, some operations such as `broadcast` cannot be used
-    because it depends on the global rank.
-
-    # TODO: ask for help from PyTorch team if we need the `broadcast` operation.
-
-    This function is useful when we are not sure about the total number of
-    processes in the process group. For example, we may have process
-    1, 2, ..., 8 who want to communicate, and process 9 might be the same
-    process as process 1, or it might be a different process; process 10
-    might be the same process as process 5, or it might be a different process.
-    In this case, how can we reliably form a communication channel within
-    process 9 and 10, without affecting the communication channel within
-    process 1, 2, ..., 8?
-
-    One possible solution is to figure out if process 9 and 10 are the same
-    as process 1 and 5 beforehand, and then form a communication channel
-    based on the information, adjusting the ranks and world_size etc. However,
-    figuring out the information is not always easy, and it will interfere
-    with the main communication channel.
-
-    Our solution is to always form a communication channel with process 1, 2,
-    ..., 8, and then use this function to form another communication channel
-    with process 9 and 10. This way, regardless of whether process 9 and 10
-    are the same as process 1 and 5, the main communication channel is
-    always formed with process 1, 2, ..., 8, and the additional communication
-    channel is formed with process 9 and 10.
-    """
-    init_method = f"tcp://{host}:{port}"
-    backend = Backend(backend)  # it is basically string
-    timeout = _get_default_timeout(backend)
-
-    store, rank, world_size = next(
-        rendezvous(init_method, rank, world_size, timeout=timeout))
-    store.set_timeout(timeout)
-
-    group_rank = rank
-    group_size = world_size
-
-    # Use a PrefixStore to avoid accidental overrides of keys used by
-    # different systems (e.g. RPC) in case the store is multi-tenant.
-    prefix_store = PrefixStore(init_method, store)
-
-    # TODO(Yizhou): The reason we need to set options while vllm does not
-    # seems to be related to the version of PyTorch. In the latest version,
-    # there is no need to set options. While in the older version, 2.5.1
-    # specifically, we need to set options.
-    options = ProcessGroup.Options(backend=backend)
-    pg: ProcessGroup = ProcessGroup(
-        prefix_store,
-        group_rank,
-        group_size,
-        options,
-    )
-    if backend == "gloo":
-        from torch.distributed.distributed_c10d import ProcessGroupGloo
-        backend_class = ProcessGroupGloo(prefix_store,
-                                         group_rank,
-                                         group_size,
-                                         timeout=timeout)
-        backend_type = ProcessGroup.BackendType.GLOO
-        device = torch.device("cpu")
-    elif backend == "nccl":
-        assert is_nccl_available()
-        from torch.distributed.distributed_c10d import ProcessGroupNCCL
-
-        backend_options = ProcessGroupNCCL.Options()
-        backend_options._timeout = timeout
-
-        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
-                                         backend_options)
-        backend_type = ProcessGroup.BackendType.NCCL
-        device = torch.device("cuda")
-    elif backend == "hccl":
-        from torch.distributed import is_hccl_available
-        assert is_hccl_available()
-        from torch_npu._C._distributed_c10d import ProcessGroupHCCL
-        backend_options = ProcessGroupHCCL.Options()
-        backend_options._timeout = timeout
-        backend_class = ProcessGroupHCCL(prefix_store, group_rank, group_size,
-                                         backend_options)
-        device = torch.device("npu")
-        backend_class._set_sequence_number_for_group()
-        backend_type = ProcessGroup.BackendType.CUSTOM
-        pg._register_backend(device, backend_type, backend_class)
-        return pg
-    else:
-        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
-
-    # TODO(Yizhou): Like we mentioned above, _set_default_backend is not
-    # implemented in the 2.5.1 version of PyTorch. But we need to set it
-    # after the latest version is released.
-    # pg._set_default_backend(backend_type)
-    backend_class._set_sequence_number_for_group()
-
-    pg._register_backend(device, backend_type, backend_class)
-
-    return pg
-
-
-utils.stateless_init_torch_distributed_process_group = stateless_init_torch_distributed_process_group
diff --git a/vllm_ascend/patch/worker/patch_0_9_0/__init__.py b/vllm_ascend/patch/platform/patch_0_9_1/__init__.py
similarity index 100%
rename from vllm_ascend/patch/worker/patch_0_9_0/__init__.py
rename to vllm_ascend/patch/platform/patch_0_9_1/__init__.py
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index d1d3d42..3b29856 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.9.0"):
-    from vllm_ascend.patch.worker import patch_0_9_0  # noqa: F401
+if vllm_version_is("0.9.1"):
+    from vllm_ascend.patch.worker import patch_0_9_1  # noqa: F401
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
diff --git a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py
similarity index 90%
rename from vllm_ascend/patch/platform/patch_0_9_0/__init__.py
rename to vllm_ascend/patch/worker/patch_0_9_1/__init__.py
index f0ac162..116c73c 100644
--- a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py
+++ b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py
@@ -14,4 +14,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import vllm_ascend.patch.platform.patch_0_9_0.patch_distributed  # noqa
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 9b0a860..c358793 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -74,7 +74,7 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
-from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is
+from vllm_ascend.utils import ProfileExecuteDuration
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
 
 if TYPE_CHECKING:
@@ -1614,44 +1614,27 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         import torch_npu
         kv_caches: Dict[str, torch.Tensor] = {}
 
-        # Remove this after we drop 0.9.0 support
-        if vllm_version_is("0.9.0"):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-                block_size=self.cache_config.block_size,
-            )
-        else:
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-                block_sizes=[self.cache_config.block_size],
-            )
+        self.input_batch = InputBatch(
+            max_num_reqs=self.max_num_reqs,
+            max_model_len=self.model_config.max_model_len,
+            max_num_batched_tokens=self.max_num_tokens,
+            device=self.device,
+            pin_memory=True,
+            vocab_size=self.model_config.get_vocab_size(),
+            block_sizes=[self.cache_config.block_size],
+        )
 
-        if not vllm_version_is("0.9.0"):
-            kv_cache_sizes = {}
-            for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
-                assert len(kv_cache_tensor.shared_by) == 1, (
-                    "KV cache tensor shared by multiple layers is not supported in "
-                    "NPU.")
-                kv_cache_sizes[
-                    kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
+        kv_cache_sizes = {}
+        for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
+            assert len(kv_cache_tensor.shared_by) == 1, (
+                "KV cache tensor shared by multiple layers is not supported in "
+                "NPU.")
+            kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size
 
         for kv_cache_group in kv_cache_config.kv_cache_groups:
             kv_cache_spec = kv_cache_group.kv_cache_spec
             for layer_name in kv_cache_group.layer_names:
-                if vllm_version_is("0.9.0"):
-                    tensor_size = kv_cache_config.tensors[layer_name].size
-                else:
-                    tensor_size = kv_cache_sizes[layer_name]
+                tensor_size = kv_cache_sizes[layer_name]
                 assert tensor_size % kv_cache_spec.page_size_bytes == 0
                 num_blocks = tensor_size // kv_cache_spec.page_size_bytes
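
The version gate this change retargets is `vllm_version_is` from vllm_ascend/utils.py, which compares the installed vllm release string against a target version. Below is a minimal sketch of the helper and of the import-time gating pattern seen in vllm_ascend/patch/platform/__init__.py; it assumes vllm exposes its release string as `vllm.__version__`, and the real helper may differ in detail (e.g. honoring an environment override), so treat it as an illustration rather than the canonical implementation.

    # Minimal sketch, assuming vllm exposes its release string as
    # vllm.__version__; the real helper lives in vllm_ascend/utils.py.
    import vllm

    def vllm_version_is(target: str) -> bool:
        return vllm.__version__ == target

    # Import-time gating, as in vllm_ascend/patch/platform/__init__.py:
    # the pinned release (0.9.1 after this change) loads its
    # release-specific patches plus the common ones, while any other
    # version (i.e. vllm main) loads the common and main-branch patches
    # per the folder layout documented in vllm_ascend/patch/__init__.py.
    if vllm_version_is("0.9.1"):
        from vllm_ascend.patch.platform import patch_0_9_1  # noqa: F401
        from vllm_ascend.patch.platform import patch_common  # noqa: F401
    else:
        from vllm_ascend.patch.platform import patch_common  # noqa: F401
        from vllm_ascend.patch.platform import patch_main  # noqa: F401

Because the gate selects the patch folder by version at import time, deleting patch_0_9_0/ and renaming the folders to patch_0_9_1/ is all that is needed to retarget the version-specific patches.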