From 819a4459ce4e2a42755f9cb9671c8c9ce0034092 Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Fri, 23 Jan 2026 09:45:08 +0800 Subject: [PATCH] Drop vLLM 0.13.0 support (#6069) ### What this PR does / why we need it? Drop vLLM 0.13.0 support, upgrade to 0.14.0 - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60 --------- Signed-off-by: hfadzxy --- .../workflows/_e2e_nightly_multi_node.yaml | 2 +- .github/workflows/nightly_test_a2.yaml | 4 +- .github/workflows/nightly_test_a3.yaml | 4 +- .github/workflows/pr_test_full.yaml | 2 +- .github/workflows/pr_test_light.yaml | 4 +- .../workflows/schedule_test_benchmarks.yaml | 2 +- Dockerfile | 2 +- Dockerfile.310p | 2 +- Dockerfile.310p.openEuler | 2 +- Dockerfile.a3 | 2 +- Dockerfile.a3.openEuler | 2 +- Dockerfile.openEuler | 2 +- docs/source/community/versioning_policy.md | 2 +- docs/source/conf.py | 2 +- .../4-cards/long_sequence/test_mtp.py | 10 +--- tests/ut/attention/test_mla_v1.py | 6 +- tests/ut/test_platform.py | 10 +--- tests/ut/worker/test_worker_v1.py | 6 +- vllm_ascend/attention/attention_v1.py | 41 +++++--------- .../context_parallel/attention_cp.py | 8 +-- .../attention/context_parallel/mla_cp.py | 8 +-- vllm_ascend/attention/mla_v1.py | 16 ++---- vllm_ascend/attention/sfa_v1.py | 13 +---- .../kv_transfer/kv_p2p/mooncake_connector.py | 7 +-- .../kv_p2p/mooncake_layerwise_connector.py | 7 +-- .../ascend_store/ascend_store_connector.py | 9 +-- .../cpu_offload/cpu_offload_connector.py | 13 +---- .../kv_transfer/kv_pool/ucm_connector.py | 6 +- vllm_ascend/kv_offload/cpu_npu.py | 10 +--- vllm_ascend/kv_offload/npu.py | 14 +---- vllm_ascend/ops/fused_moe/fused_moe.py | 9 +-- vllm_ascend/ops/mla.py | 9 +-- vllm_ascend/ops/mm_encoder_attention.py | 8 +-- vllm_ascend/ops/rotary_embedding.py | 25 ++------- vllm_ascend/ops/triton/mamba/causal_conv1d.py | 9 +-- vllm_ascend/patch/platform/__init__.py | 2 +- vllm_ascend/patch/worker/patch_qwen3_next.py | 9 +-- vllm_ascend/worker/model_runner_v1.py | 55 +++++-------------- vllm_ascend/worker/worker.py | 14 +---- 39 files changed, 86 insertions(+), 272 deletions(-) diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index df3d30a5..71ab5165 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -32,7 +32,7 @@ on: description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need vllm_version: required: false - default: "v0.13.0" + default: "v0.14.0" type: string description: vllm version to use vllm_ascend_remote_url: diff --git a/.github/workflows/nightly_test_a2.yaml b/.github/workflows/nightly_test_a2.yaml index f0bba92d..4817dbd1 100644 --- a/.github/workflows/nightly_test_a2.yaml +++ b/.github/workflows/nightly_test_a2.yaml @@ -66,7 +66,7 @@ jobs: tests: tests/e2e/nightly/single_node/ops/multicard_ops_a2/ uses: ./.github/workflows/_e2e_nightly_single_node.yaml with: - vllm: v0.13.0 + vllm: v0.14.0 runner: ${{ matrix.test_config.os }} tests: ${{ matrix.test_config.tests }} name: ${{ matrix.test_config.name }} @@ -137,7 +137,7 @@ jobs: - Qwen3-Omni-30B-A3B-Instruct uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml with: - vllm: v0.13.0 + vllm: v0.14.0 runner: ${{ matrix.test_config.os }} model_list: ${{ toJson(matrix.test_config.model_list) }} image: 
'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11' diff --git a/.github/workflows/nightly_test_a3.yaml b/.github/workflows/nightly_test_a3.yaml index b46b45bc..40ee1821 100644 --- a/.github/workflows/nightly_test_a3.yaml +++ b/.github/workflows/nightly_test_a3.yaml @@ -158,7 +158,7 @@ jobs: tests: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py uses: ./.github/workflows/_e2e_nightly_single_node.yaml with: - vllm: v0.13.0 + vllm: v0.14.0 runner: ${{ matrix.test_config.os }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' tests: ${{ matrix.test_config.tests }} @@ -178,7 +178,7 @@ jobs: uses: ./.github/workflows/_e2e_nightly_single_node.yaml with: runner: ${{ matrix.test_config.os }} - vllm: v0.13.0 + vllm: v0.14.0 image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' tests: ${{ matrix.test_config.tests }} name: ${{ matrix.test_config.name }} diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index e012eca4..03866afa 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.13.0] + vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 729889b2..b044184d 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -84,7 +84,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.13.0] + vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -96,7 +96,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.13.0] + vllm_version: [d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_test_benchmarks.yaml b/.github/workflows/schedule_test_benchmarks.yaml index 6a7c96f9..173b671b 100644 --- a/.github/workflows/schedule_test_benchmarks.yaml +++ b/.github/workflows/schedule_test_benchmarks.yaml @@ -51,7 +51,7 @@ jobs: strategy: matrix: include: - - vllm_branch: v0.13.0 + - vllm_branch: v0.14.0 vllm_ascend_branch: main max-parallel: 1 container: diff --git a/Dockerfile b/Dockerfile index 2c43ba8b..188aace0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.13.0 +ARG VLLM_TAG=v0.14.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index fe452cd5..8e03b63e 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.13.0 +ARG VLLM_TAG=v0.14.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index fbcf0149..047cb175 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.13.0 +ARG VLLM_TAG=v0.14.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3 b/Dockerfile.a3 index c6b48709..fbecd5d5 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -47,7 +47,7 @@ RUN apt-get update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.13.0 +ARG VLLM_TAG=v0.14.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 75896e8a..6f9f9569 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -50,7 +50,7 @@ RUN yum update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.13.0 +ARG VLLM_TAG=v0.14.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index e634a3e9..c884fb9c 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -50,7 +50,7 @@ RUN yum update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.13.0 +ARG VLLM_TAG=v0.14.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 46327d25..2f88e52c 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -53,7 +53,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.13.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | d68209402ddab3f54a09bc1f4de9a9495a283b60, v0.14.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/docs/source/conf.py b/docs/source/conf.py index 1f332969..603673b8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -77,7 +77,7 @@ myst_substitutions = { # CANN image tag "cann_image_tag": "8.5.0-910b-ubuntu22.04-py3.11", # vllm version in ci - "ci_vllm_version": "v0.13.0", + "ci_vllm_version": "v0.14.0", } # For cross-file header anchors diff --git a/tests/e2e/multicard/4-cards/long_sequence/test_mtp.py b/tests/e2e/multicard/4-cards/long_sequence/test_mtp.py index 3cf269c5..78baeffb 100644 --- a/tests/e2e/multicard/4-cards/long_sequence/test_mtp.py +++ b/tests/e2e/multicard/4-cards/long_sequence/test_mtp.py @@ -21,7 +21,6 @@ import os import pytest from tests.e2e.conftest import VllmRunner -from vllm_ascend.utils import vllm_version_is os.environ["HCCL_BUFFSIZE"] = "512" @@ -51,8 +50,7 @@ def test_pcp_dcp_mtp1_eager(): runner.generate_greedy(prompts, 32) -@pytest.mark.skipif( - not vllm_version_is('0.13.0'), +@pytest.mark.skip( reason="vLLM PR-32118 break this", ) def test_pcp_dcp_mtp3_eager(): @@ -80,8 +78,7 @@ def test_pcp_dcp_mtp3_eager(): runner.generate_greedy(prompts, 32) -@pytest.mark.skipif( - not vllm_version_is('0.13.0'), +@pytest.mark.skip( reason="vLLM PR-32118 break this", ) def test_pcp_dcp_mtp3_piecewise_graph(): @@ -112,8 +109,7 @@ def test_pcp_dcp_mtp3_piecewise_graph(): runner.generate_greedy(prompts, 32) -@pytest.mark.skipif( - not vllm_version_is('0.13.0'), +@pytest.mark.skip( reason="vLLM PR-32118 break this", ) def test_pcp_dcp_mtp3_full_graph(): diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py index 4f76691b..d82218e0 100755 --- a/tests/ut/attention/test_mla_v1.py +++ b/tests/ut/attention/test_mla_v1.py @@ -17,7 +17,6 @@ from vllm_ascend.attention.mla_v1 import (AscendMLABackend, AscendMLAPrefillMetadata, ChunkedContextMetadata) from vllm_ascend.attention.utils import AscendCommonAttentionMetadata -from vllm_ascend.utils import vllm_version_is class TestAscendMLABackend(TestBase): @@ -477,10 +476,7 @@ class TestAscendMLAMetadataBuilderBuild(TestBase): self.mock_vllm_config.model_config = model_config self.kv_cache_spec = MagicMock() self.kv_cache_spec.num_layers = 32 - if vllm_version_is('0.13.0'): - self.kv_cache_spec.head_size = 128 - else: - self.kv_cache_spec.head_size = 64 + self.kv_cache_spec.head_size = 64 self.kv_cache_spec.num_heads = 32 def tearDown(self): diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index ba61744b..978a24ba 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -5,17 +5,11 @@ import pytest import torch from vllm.config.compilation import CompilationMode, CUDAGraphMode from vllm.platforms import 
PlatformEnum +from vllm.v1.attention.selector import AttentionSelectorConfig # type: ignore from tests.ut.base import TestBase from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, AscendDeviceType, vllm_version_is - -# isort: off -if vllm_version_is("0.13.0"): - from vllm.attention.selector import AttentionSelectorConfig # type: ignore -else: - from vllm.v1.attention.selector import AttentionSelectorConfig # type: ignore -# isort: on +from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, AscendDeviceType class TestNPUPlatform(TestBase): diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index 9114764e..afb9d42a 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -5,7 +5,6 @@ import torch from vllm.config import CacheConfig, ModelConfig, ParallelConfig, ProfilerConfig, VllmConfig from tests.ut.base import TestBase -from vllm_ascend.utils import vllm_version_is init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules" @@ -141,10 +140,7 @@ class TestNPUWorker(TestBase): ) # Verify init_cached_hf_modules is called (trust_remote_code=True) - if vllm_version_is('0.13.0'): - mock_init_cached_hf_modules.assert_called_once() - else: - mock_init_cached_hf_modules.assert_not_called() + mock_init_cached_hf_modules.assert_not_called() @patch("vllm_ascend.utils.adapt_patch") @patch("vllm_ascend.ops") diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 6c45b6dc..2f0a8348 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -25,6 +25,18 @@ import vllm.envs as envs_vllm from vllm.config import VllmConfig, get_current_vllm_config from vllm.forward_context import ForwardContext, get_forward_context from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backend import ( # type: ignore + AttentionBackend, + AttentionCGSupport, + AttentionImpl, + AttentionLayer, + AttentionMetadataBuilder, + AttentionType, +) +from vllm.v1.attention.backends.registry import ( # type: ignore + AttentionBackendEnum, + register_backend, +) from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec @@ -44,34 +56,7 @@ from vllm_ascend.compilation.acl_graph import ( ) from vllm_ascend.device.device_op import DeviceOperator from vllm_ascend.ops.flashcomm2_oshard_manager import flashcomm2_oshard_manager -from vllm_ascend.utils import vllm_version_is, weak_ref_tensors - -if vllm_version_is("0.13.0"): - from vllm.attention.backends.abstract import ( # type: ignore - AttentionBackend, - AttentionImpl, - AttentionLayer, - AttentionType, - ) - from vllm.attention.backends.registry import ( # type: ignore - AttentionBackendEnum, - register_backend, - ) - from vllm.v1.attention.backends.utils import AttentionCGSupport, AttentionMetadataBuilder -else: - from vllm.v1.attention.backend import ( # type: ignore - AttentionBackend, - AttentionCGSupport, - AttentionImpl, - AttentionLayer, - AttentionMetadataBuilder, - AttentionType, - ) - from vllm.v1.attention.backends.registry import ( # type: ignore - AttentionBackendEnum, - register_backend, - ) - +from vllm_ascend.utils import weak_ref_tensors # default max value of sliding window size SWA_INT_MAX = 2147483647 diff --git a/vllm_ascend/attention/context_parallel/attention_cp.py b/vllm_ascend/attention/context_parallel/attention_cp.py 
index d7cf9963..cae53590 100644 --- a/vllm_ascend/attention/context_parallel/attention_cp.py +++ b/vllm_ascend/attention/context_parallel/attention_cp.py @@ -29,6 +29,7 @@ from vllm.distributed import ( get_pcp_group, ) from vllm.forward_context import ForwardContext, get_forward_context +from vllm.v1.attention.backend import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec from vllm_ascend.attention.attention_v1 import ( @@ -49,12 +50,7 @@ from vllm_ascend.attention.utils import ( split_decodes_and_prefills, ) from vllm_ascend.compilation.acl_graph import get_graph_params, update_graph_params_workspaces -from vllm_ascend.utils import cp_chunkedprefill_comm_stream, vllm_version_is, weak_ref_tensors - -if vllm_version_is("0.13.0"): - from vllm.v1.attention.backends.utils import AttentionCGSupport -else: - from vllm.v1.attention.backend import AttentionCGSupport +from vllm_ascend.utils import cp_chunkedprefill_comm_stream, weak_ref_tensors class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder): diff --git a/vllm_ascend/attention/context_parallel/mla_cp.py b/vllm_ascend/attention/context_parallel/mla_cp.py index 58b64956..81298f94 100644 --- a/vllm_ascend/attention/context_parallel/mla_cp.py +++ b/vllm_ascend/attention/context_parallel/mla_cp.py @@ -12,6 +12,7 @@ from vllm.distributed import ( ) from vllm.forward_context import ForwardContext, get_forward_context from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backend import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec from vllm_ascend.attention.attention_v1 import AscendAttentionState @@ -37,12 +38,7 @@ from vllm_ascend.attention.context_parallel.common_cp import ( ) from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.compilation.acl_graph import get_draft_graph_params, get_graph_params, update_graph_params_workspaces -from vllm_ascend.utils import vllm_version_is, weak_ref_tensors - -if vllm_version_is("0.13.0"): - from vllm.v1.attention.backends.utils import AttentionCGSupport -else: - from vllm.v1.attention.backend import AttentionCGSupport +from vllm_ascend.utils import weak_ref_tensors MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index f695be26..9383a165 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -10,7 +10,10 @@ from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import logger from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.utils.math_utils import cdiv, round_down +from vllm.v1.attention.backend import ( # type: ignore + AttentionBackend, AttentionCGSupport, MLAAttentionImpl) from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder +from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec from vllm_ascend import envs @@ -35,23 +38,12 @@ from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, maybe_trans_nz, - vllm_version_is, weak_ref_tensors) + weak_ref_tensors) from vllm_ascend.worker.npu_input_batch import NPUInputBatch if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput -# isort: off -if 
vllm_version_is('0.13.0'): - from vllm.v1.attention.backends.utils import AttentionCGSupport - from vllm.attention.backends.abstract import ( # type: ignore - AttentionBackend, MLAAttentionImpl) - from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore -else: - from vllm.v1.attention.backend import ( # type: ignore - AttentionBackend, AttentionCGSupport, MLAAttentionImpl) - from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore -# isort: on MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 BUILD_METADATA_STEP_PREFILL = 0 diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py index 71ab0c0b..e36a928a 100644 --- a/vllm_ascend/attention/sfa_v1.py +++ b/vllm_ascend/attention/sfa_v1.py @@ -12,6 +12,8 @@ from vllm.forward_context import get_forward_context from vllm.logger import logger from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.triton_utils import HAS_TRITON +from vllm.v1.attention.backend import ( # type: ignore + AttentionBackend, AttentionCGSupport, MLAAttentionImpl) from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder from vllm.v1.kv_cache_interface import AttentionSpec @@ -35,20 +37,11 @@ from vllm_ascend.ops.triton.rope import rope_forward_triton from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, _round_up, dispose_layer, - enable_dsa_cp, enable_dsa_cp_with_layer_shard, maybe_trans_nz, vllm_version_is) + enable_dsa_cp, enable_dsa_cp_with_layer_shard, maybe_trans_nz) from vllm_ascend.worker.npu_input_batch import NPUInputBatch -# isort: off if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput -if vllm_version_is('0.13.0'): - from vllm.v1.attention.backends.utils import AttentionCGSupport - from vllm.attention.backends.abstract import ( # type: ignore - AttentionBackend, MLAAttentionImpl) -else: - from vllm.v1.attention.backend import ( # type: ignore - AttentionBackend, AttentionCGSupport, MLAAttentionImpl) -# isort: on # token count limits within bmm_transpose operator BMM_TRANS_MAX_SUPPORTED_TOKENS = 1024 diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py index a2323adf..f1a6d6ab 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py @@ -43,14 +43,11 @@ from vllm.v1.request import RequestStatus from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import global_te from vllm_ascend.distributed.kv_transfer.utils.utils import get_transfer_timeout_value -from vllm_ascend.utils import is_vl_model, vllm_version_is +from vllm_ascend.utils import is_vl_model # isort: off if TYPE_CHECKING: - if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionMetadata # type: ignore - else: - from vllm.attention.backends import AttentionMetadata # type: ignore + from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py index 
2484a415..e06881fe 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py @@ -38,14 +38,11 @@ from vllm_ascend.distributed.kv_transfer.utils.mooncake_transfer_engine import \ global_te from vllm_ascend.distributed.kv_transfer.utils.utils import ( align_memory, get_transfer_timeout_value, kv_alltoall_and_rearrange) -from vllm_ascend.utils import npu_stream_switch, vllm_version_is +from vllm_ascend.utils import npu_stream_switch # isort: off if TYPE_CHECKING: - if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionMetadata # type: ignore - else: - from vllm.attention.backends import AttentionMetadata # type: ignore + from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/ascend_store_connector.py b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/ascend_store_connector.py index 7580f59c..7661ef7a 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/ascend_store_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/ascend_store_connector.py @@ -9,6 +9,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( from vllm.forward_context import ForwardContext from vllm.logger import logger from vllm.utils.network_utils import make_zmq_socket +from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig @@ -19,14 +20,6 @@ from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.pool_scheduler imp KVPoolScheduler, get_zmq_rpc_path_lookup) from vllm_ascend.distributed.kv_transfer.kv_pool.ascend_store.pool_worker import \ KVPoolWorker -from vllm_ascend.utils import vllm_version_is - -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionMetadata # type: ignore -else: - from vllm.v1.attention.backend import AttentionMetadata # type: ignore -# isort: on class AscendStoreConnector(KVConnectorBase_V1): diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py index 34f6b8a8..8f50cbb0 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py @@ -24,25 +24,14 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheSpec from vllm_ascend.distributed.kv_transfer.kv_pool.cpu_offload.metadata import ( MetadataServer, MetadataServerProc, MLAConfig) -from vllm_ascend.utils import vllm_version_is -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionType # type: ignore -else: - from vllm.v1.attention.backend import AttentionType # type: ignore if TYPE_CHECKING: - if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import \ - AttentionMetadata # type: ignore - else: - from vllm.v1.attention.backend import AttentionType #type: ignore + from vllm.v1.attention.backend import AttentionMetadata #type: ignore from vllm.forward_context import ForwardContext from 
vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request -# isort: on @dataclass diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/ucm_connector.py b/vllm_ascend/distributed/kv_transfer/kv_pool/ucm_connector.py index 4ac5e717..df3a112d 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/ucm_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ucm_connector.py @@ -9,16 +9,12 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput -from vllm_ascend.utils import vllm_version_is logger = init_logger(__name__) # isort: off if TYPE_CHECKING: - if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionMetadata # type: ignore - else: - from vllm.v1.attention.backend import AttentionMetadata # type: ignore + from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorPromMetrics, KVConnectorStats, PromMetric, PromMetricT) from vllm.forward_context import ForwardContext diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py index fa80d860..13e9869d 100644 --- a/vllm_ascend/kv_offload/cpu_npu.py +++ b/vllm_ascend/kv_offload/cpu_npu.py @@ -2,19 +2,11 @@ import numpy as np import torch from vllm.logger import init_logger from vllm.utils.platform_utils import is_pin_memory_available +from vllm.v1.attention.backend import AttentionBackend # type: ignore from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, TransferResult, TransferSpec) -from vllm_ascend.utils import vllm_version_is - -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionBackend # type: ignore -else: - from vllm.v1.attention.backend import AttentionBackend # type: ignore -# isort: on - logger = init_logger(__name__) diff --git a/vllm_ascend/kv_offload/npu.py b/vllm_ascend/kv_offload/npu.py index e0df1484..3c5d8ae0 100644 --- a/vllm_ascend/kv_offload/npu.py +++ b/vllm_ascend/kv_offload/npu.py @@ -3,6 +3,7 @@ from typing import Optional import torch from vllm.config import VllmConfig +from vllm.v1.attention.backend import AttentionBackend # type: ignore from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.backends.cpu import CPUBackend from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager @@ -12,14 +13,6 @@ from vllm.v1.kv_offload.worker.worker import OffloadingHandler from vllm.v1.kv_cache_interface import KVCacheConfig from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler -from vllm_ascend.utils import vllm_version_is - -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionBackend # type: ignore -else: - from vllm.v1.attention.backend import AttentionBackend # type: ignore -# isort: on class NPUOffloadingSpec(OffloadingSpec): @@ -27,10 +20,7 @@ class NPUOffloadingSpec(OffloadingSpec): def __init__(self, vllm_config: VllmConfig, kv_cache_config: Optional[KVCacheConfig] = None): - if vllm_version_is('0.13.0'): - super().__init__(vllm_config) - else: - super().__init__(vllm_config, kv_cache_config) + super().__init__(vllm_config, kv_cache_config) num_cpu_blocks = self.extra_config.get("num_cpu_blocks") if not num_cpu_blocks: diff --git 
a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index 2c169f5c..864ea3c8 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -50,7 +50,7 @@ from vllm_ascend.quantization.w8a8_dynamic import \ from vllm_ascend.utils import (AscendDeviceType, enable_sp, get_ascend_device_type, maybe_trans_nz, npu_stream_switch, shared_expert_dp_enabled, - shared_experts_calculation_stream, vllm_version_is) + shared_experts_calculation_stream) @dataclass class FusedMoEResult: @@ -451,12 +451,7 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE): # Qwen3-Next specific gating mechanism if hasattr(self._shared_experts, "expert_gate") and \ self._shared_experts.expert_gate is not None: - if vllm_version_is('0.13.0'): - # TODO(jianzs): remove this branch after vLLM new version is - # released - gate_out = self._shared_experts.expert_gate(hidden_states) # type: ignore - else: - gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore + gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore shared_out = F.sigmoid(gate_out) * shared_out return shared_out diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index 7ae8e29e..c2a3f576 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -31,16 +31,9 @@ from vllm.model_executor.layers.mla import (MLAModules, MultiHeadLatentAttentionWrapper) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.utils.torch_utils import direct_register_custom_op +from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.utils import vllm_version_is - -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionMetadata # type: ignore -else: - from vllm.v1.attention.backend import AttentionMetadata # type: ignore -# isort: on class IndexerWrapper(nn.Module): diff --git a/vllm_ascend/ops/mm_encoder_attention.py b/vllm_ascend/ops/mm_encoder_attention.py index 9ab785dd..081bc45f 100644 --- a/vllm_ascend/ops/mm_encoder_attention.py +++ b/vllm_ascend/ops/mm_encoder_attention.py @@ -20,16 +20,10 @@ import torch import torch.nn.functional as F import torch_npu from vllm.config import MultiModalConfig +from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore import vllm_ascend.envs as envs_ascend -from vllm_ascend.utils import vllm_version_is -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention # type: ignore -else: - from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore -# isort: on MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 04c9d302..2f507b74 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -31,8 +31,7 @@ if HAS_TRITON: from vllm_ascend.platform import NPUPlatform from vllm_ascend.utils import (AscendDeviceType, enable_custom_op, - get_ascend_device_type, has_rope, is_vl_model, - vllm_version_is) + get_ascend_device_type, has_rope, is_vl_model) # Currently, rope ops used on npu requires detached cos && sin as inputs. # However, RotaryEmbedding in vllm use cos_sin_cache as a whole variable. 
@@ -637,18 +636,8 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb): cos: torch.Tensor, sin: torch.Tensor, ) -> torch.Tensor: - if vllm_version_is('0.13.0'): - origin_shape = x.shape - origin_dtype = x.dtype - if len(origin_shape) == 3: - x = x.unsqueeze(0) - if self.enable_fp32_compute: - x = x.float() - cos = cos.float() - sin = sin.float() - else: - x, cos, sin, origin_shape, origin_dtype = self._pre_process( - x, cos, sin) + x, cos, sin, origin_shape, origin_dtype = self._pre_process( + x, cos, sin) head_dim = x.shape[-1] # cos, sin: [seq_len, head_dim // 2] @@ -660,12 +649,6 @@ class AscendApplyRotaryEmb(ApplyRotaryEmb): output = torch_npu.npu_rotary_mul(x, cos, sin) - if vllm_version_is('0.13.0'): - if len(origin_shape) == 3: - output = output.squeeze(0) - if self.enable_fp32_compute: - output = output.to(origin_dtype) - else: - output = self._post_process(output, origin_shape, origin_dtype) + output = self._post_process(output, origin_shape, origin_dtype) return output diff --git a/vllm_ascend/ops/triton/mamba/causal_conv1d.py b/vllm_ascend/ops/triton/mamba/causal_conv1d.py index 84c330b5..4a304d99 100644 --- a/vllm_ascend/ops/triton/mamba/causal_conv1d.py +++ b/vllm_ascend/ops/triton/mamba/causal_conv1d.py @@ -14,14 +14,7 @@ import torch.nn.functional as F import triton import triton.language as tl -from vllm_ascend.utils import vllm_version_is - -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore -else: - from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore -# isort: on +from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore def causal_conv1d_ref( diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index 49840db3..c215c059 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -27,5 +27,5 @@ if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv( "EXPERT_MAP_RECORD", "false") == "true": import vllm_ascend.patch.platform.patch_multiproc_executor # noqa -if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is('0.13.0'): +if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is('0.14.0'): import vllm_ascend.patch.platform.patch_balance_schedule # noqa diff --git a/vllm_ascend/patch/worker/patch_qwen3_next.py b/vllm_ascend/patch/worker/patch_qwen3_next.py index 0789b796..21c549eb 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next.py @@ -28,6 +28,7 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( from vllm.model_executor.models.qwen3_next import (Qwen3NextGatedDeltaNet, fused_gdn_gating) from vllm.triton_utils import triton +from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata from vllm_ascend.ops.triton.fla.fused_qkvzba_split_reshape import \ @@ -35,14 +36,6 @@ from vllm_ascend.ops.triton.fla.fused_qkvzba_split_reshape import \ from vllm_ascend.ops.triton.fla.sigmoid_gating import \ fused_sigmoid_gating_delta_rule_update from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch -from vllm_ascend.utils import vllm_version_is - -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import AttentionMetadata # type: ignore -else: - from vllm.v1.attention.backend import AttentionMetadata # type: ignore -# isort: on class AscendQwen3Next_GatedDeltaNet(nn.Module, MambaBase): diff 
--git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index e2fb61f7..7409d21c 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -50,8 +50,10 @@ from vllm.sequence import IntermediateTensors from vllm.utils.import_utils import LazyLoader from vllm.utils.math_utils import cdiv from vllm.utils.mem_utils import DeviceMemoryProfiler +from vllm.v1.attention.backend import AttentionBackend, AttentionType # type: ignore from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.attention.selector import get_attn_backend # type: ignore from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import (AttentionSpec, EncoderOnlyAttentionSpec, @@ -102,7 +104,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration, enable_sp, get_ascend_device_type, is_moe_model, lmhead_tp_enable, maybe_trans_nz, - set_weight_prefetch_method, vllm_version_is) + set_weight_prefetch_method) from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.pcp_utils import PCPManager @@ -115,15 +117,6 @@ if TYPE_CHECKING: else: xgr = LazyLoader("xgr", globals(), "xgrammar") -# isort: off -if vllm_version_is('0.13.0'): - from vllm.attention.backends.abstract import ( # type: ignore - AttentionBackend, AttentionType) - from vllm.attention.selector import get_attn_backend # type: ignore -else: - from vllm.v1.attention.selector import get_attn_backend # type: ignore - from vllm.v1.attention.backend import AttentionBackend, AttentionType # type: ignore -# isort: on import torch_npu # if true, allow tensor initialization and casting with internal format (e.g., NZ) @@ -746,10 +739,7 @@ class NPUModelRunner(GPUModelRunner): # _prepare_inputs may reorder the batch, so we must gather # multi-modal outputs after that to ensure the correct order - if vllm_version_is('0.13.0'): - model_kwargs = self._init_model_kwargs(num_input_tokens) - else: - model_kwargs = self._init_model_kwargs() + model_kwargs = self._init_model_kwargs() if self.is_multimodal_model and not self.model_config.is_encoder_decoder: self.multimodal_cpu_fields = ["grid_thw"] self._prepare_multimodal_fields() @@ -1575,16 +1565,10 @@ class NPUModelRunner(GPUModelRunner): logits = None else: if self.input_batch.pooling_params: - if vllm_version_is('0.13.0'): - pool_output = self._pool( - hidden_states, - scheduler_output.total_num_scheduled_tokens, - num_scheduled_tokens_np) - else: - pool_output = self._pool( - hidden_states, - scheduler_output.total_num_scheduled_tokens, - num_scheduled_tokens_np, kv_connector_output) + pool_output = self._pool( + hidden_states, + scheduler_output.total_num_scheduled_tokens, + num_scheduled_tokens_np, kv_connector_output) if self.debugger is not None: self.debugger.stop() self.debugger.step() @@ -1675,8 +1659,7 @@ class NPUModelRunner(GPUModelRunner): attn_metadata, aux_hidden_states, ) - if not vllm_version_is('0.13.0'): - self._copy_draft_token_ids_to_cpu(scheduler_output) + self._copy_draft_token_ids_to_cpu(scheduler_output) ( logprobs_lists, @@ -1826,20 +1809,12 @@ class NPUModelRunner(GPUModelRunner): valid_sampled_token_ids[int(i)].clear() else: # Includes spec decode tokens. 
- if vllm_version_is('0.13.0'): - valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output( - sampled_token_ids, - self.input_batch.vocab_size, - discard_sampled_tokens_req_indices, - return_cu_num_tokens=logprobs_tensors is not None, - ) - else: - valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output( - sampled_token_ids, - self.input_batch.vocab_size, - discard_sampled_tokens_req_indices, - logprobs_tensors=logprobs_tensors, - ) + valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output( + sampled_token_ids, + self.input_batch.vocab_size, + discard_sampled_tokens_req_indices, + logprobs_tensors=logprobs_tensors, + ) else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 546d849e..f697f928 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -58,16 +58,13 @@ from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel from vllm_ascend.ops.triton.triton_utils import init_device_properties_triton from vllm_ascend.utils import (AscendDeviceType, check_ascend_device_type, enable_sp, get_ascend_device_type, - register_ascend_customop, vllm_version_is) + register_ascend_customop) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402 from torch._dynamo.variables import TorchInGraphFunctionVariable # noqa: E402 -if vllm_version_is("0.13.0"): - from vllm.model_executor.utils import set_random_seed -else: - from vllm.utils.torch_utils import set_random_seed +from vllm.utils.torch_utils import set_random_seed torch_non_c_binding_in_graph_functions_npu = dict.fromkeys( ["torch.npu.current_stream"], @@ -121,13 +118,6 @@ class NPUWorker(WorkerBase): self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ self.cache_config.cache_dtype] - if vllm_version_is('0.13.0'): - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils.import_utils import init_cached_hf_modules - - init_cached_hf_modules() - self.profiler = self._init_profiler() if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode: # Buffers saved before sleep
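
For out-of-tree code that still carries the version gate removed throughout this patch, the migration is mechanical: drop the `vllm_version_is('0.13.0')` branch and import from the vLLM 0.14.0 module paths directly. Below is a minimal sketch of that pattern, using module paths taken verbatim from the hunks above; `AttentionBackend` is just one example, and the same applies to `AttentionMetadata`, `PAD_SLOT_ID`, `get_attn_backend`, and the other gated imports touched here.

```python
# Old, version-gated pattern (removed by this patch):
#
#   from vllm_ascend.utils import vllm_version_is
#   if vllm_version_is("0.13.0"):
#       from vllm.attention.backends.abstract import AttentionBackend  # type: ignore
#   else:
#       from vllm.v1.attention.backend import AttentionBackend  # type: ignore
#
# New pattern, assuming vLLM v0.14.0 (or the pinned main commit) is installed:
from vllm.v1.attention.backend import AttentionBackend  # type: ignore
```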