diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index 5f14620a..be6dec25 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -32,7 +32,7 @@ on: description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need vllm_version: required: false - default: "v0.11.0" + default: "2918c1b49c88c29783c86f78d2c4221cb9622379" type: string description: vllm version to use vllm_ascend_remote_url: diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 258c434a..9ff6407b 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -51,7 +51,7 @@ jobs: strategy: matrix: include: - - vllm_branch: v0.11.0 + - vllm_branch: 2918c1b49c88c29783c86f78d2c4221cb9622379 vllm_ascend_branch: main max-parallel: 1 container: diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index ecfa83a6..cffc25b1 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -83,7 +83,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0] + vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379] steps: - name: Install packages run: | @@ -138,7 +138,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0] + vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
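The CI matrices above drop the `v0.11.0` entry and pin every job to a single vLLM development commit. As a minimal sketch (not part of this change set; the install flags mirror the Dockerfiles later in this diff, with pip's long-form `--extra-index-url`), the same pin can be reproduced locally with a plain clone followed by a checkout, since `git clone --branch` does not accept a bare commit SHA — the same reason the Dockerfiles below switch from a shallow `--branch` clone to a full clone plus `git checkout $VLLM_TAG`:

```bash
# Sketch: reproduce the vLLM pin used by these workflows outside of CI.
# A bare SHA cannot be passed to `git clone --branch`, so clone first and
# check the commit out afterwards.
git clone https://github.com/vllm-project/vllm.git vllm
cd vllm
git checkout 2918c1b49c88c29783c86f78d2c4221cb9622379

# Build the CPU-only (device-agnostic) wheel the same way the Dockerfiles do.
VLLM_TARGET_DEVICE="empty" python3 -m pip install -e . \
    --extra-index-url https://download.pytorch.org/whl/cpu/
```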
diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index ec5fb344..366d3ff9 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -69,7 +69,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0] + vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml index 4baa3332..2be59996 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml @@ -86,7 +86,7 @@ jobs: tests: tests/e2e/nightly/ops uses: ./.github/workflows/_e2e_nightly_single_node.yaml with: - vllm: v0.11.0 + vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379 runner: ${{ matrix.test_config.os }} tests: ${{ matrix.test_config.tests }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' @@ -125,7 +125,7 @@ jobs: - Qwen3-Next-80B-A3B-Instruct uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml with: - vllm: v0.11.0 + vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379 runner: ${{ matrix.test_config.os }} model_list: ${{ toJson(matrix.test_config.model_list) }} image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml index f425ce3c..062aaac1 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml @@ -136,7 +136,7 @@ jobs: tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py uses: ./.github/workflows/_e2e_nightly_single_node.yaml with: - vllm: v0.11.0 + vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379 runner: ${{ matrix.test_config.os }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' tests: ${{ matrix.test_config.tests }} diff --git a/.github/workflows/vllm_ascend_test_report.yaml b/.github/workflows/vllm_ascend_test_report.yaml index ec681f4a..9be40eb2 100644 --- a/.github/workflows/vllm_ascend_test_report.yaml +++ b/.github/workflows/vllm_ascend_test_report.yaml @@ -72,7 +72,7 @@ jobs: - DeepSeek-V2-Lite uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml with: - vllm: v0.11.0 + vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379 runner: ${{ matrix.runner }} image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 model_list: ${{ toJson(matrix.model_list) }} diff --git a/Dockerfile b/Dockerfile index 72158475..ca677bf5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,8 +46,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.11.0 -RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379 +# Revert this change once VLLM_TAG is specified to branch or tag +# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG) # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index 3cbe02e6..acd8061e 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -37,8 +37,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.11.0 -RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379 +# Revert this change once VLLM_TAG is specified to branch or tag +# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG) # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index 0d14fd52..1f575ec2 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -34,9 +34,10 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.11.0 - -RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379 +# Revert this change once VLLM_TAG is specified to branch or tag +# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG) # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.a3 b/Dockerfile.a3 index 97dd6f1d..ef04c274 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -45,8 +45,10 @@ RUN apt-get update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.11.0 -RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379 +# Revert this change once VLLM_TAG is specified to branch or tag +# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG) # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 0e92b0f7..003f6ac5 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -48,9 +48,10 @@ RUN yum update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.11.0 - -RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379 +# Revert this change once VLLM_TAG is specified to branch or tag +# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG) # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index cca33174..3b5436d0 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -48,9 +48,10 @@ RUN yum update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.11.0 - -RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +ARG VLLM_TAG=2918c1b49c88c29783c86f78d2c4221cb9622379 +# Revert this change once VLLM_TAG is specified to branch or tag +# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG) # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py index 62ef99bf..b16d50ff 100644 --- a/examples/offline_data_parallel.py +++ b/examples/offline_data_parallel.py @@ -63,11 +63,7 @@ import torch from vllm import LLM, SamplingParams from vllm.distributed.parallel_state import ( # noqa E402 destroy_distributed_environment, destroy_model_parallel) -from vllm_ascend.utils import vllm_version_is -if vllm_version_is("0.11.0"): - from vllm.utils import get_open_port -else: - from vllm.utils.network_utils import get_open_port +from vllm.utils.network_utils import get_open_port os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/examples/offline_external_launcher.py b/examples/offline_external_launcher.py index 35d5fcfd..a015fd6f 100644 --- a/examples/offline_external_launcher.py +++ b/examples/offline_external_launcher.py @@ -67,13 +67,8 @@ from vllm import LLM, SamplingParams from vllm.distributed.parallel_state import ( # noqa E402 destroy_distributed_environment, destroy_model_parallel, get_tp_group) from safetensors.torch import load_file -from vllm_ascend.utils import vllm_version_is -if vllm_version_is("0.11.0"): - from vllm.utils import GiB_bytes, get_open_port - -else: - from vllm.utils.mem_constants import GiB_bytes - from vllm.utils.network_utils import get_open_port +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.network_utils import get_open_port os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/examples/offline_inference_sleep_mode_npu.py b/examples/offline_inference_sleep_mode_npu.py index 53c59357..9f3b0917 100644 --- a/examples/offline_inference_sleep_mode_npu.py +++ b/examples/offline_inference_sleep_mode_npu.py @@ -20,11 +20,7 @@ import os import torch from vllm import LLM, SamplingParams -from vllm_ascend.utils import vllm_version_is -if vllm_version_is("0.11.0"): - from vllm.utils import GiB_bytes -else: - from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_constants import GiB_bytes os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/examples/offline_weight_load.py b/examples/offline_weight_load.py index c24ac3f5..6d970f37 100644 --- a/examples/offline_weight_load.py +++ b/examples/offline_weight_load.py @@ -67,13 +67,8 @@ from vllm import LLM, SamplingParams from vllm.distributed.parallel_state import ( # noqa E402 destroy_distributed_environment, destroy_model_parallel, get_tp_group) from safetensors.torch import load_file -from vllm_ascend.utils import vllm_version_is -if vllm_version_is("0.11.0"): - from vllm.utils import GiB_bytes, get_open_port - -else: - from vllm.utils.mem_constants import GiB_bytes - from vllm.utils.network_utils import get_open_port +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.network_utils import get_open_port os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index dc68bd12..4d2c8c5f 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -45,6 +45,7 @@ from vllm.inputs import TextPrompt from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.transformers_utils.utils 
import maybe_model_redirect +from vllm.utils.network_utils import get_open_port from tests.e2e.model_utils import (TokensTextLogprobs, TokensTextLogprobsPromptLogprobs) @@ -54,12 +55,6 @@ from vllm_ascend.ascend_config import clear_ascend_config # we not explicitly patch here, some of them might be effectiveless # in pytest scenario from vllm_ascend.utils import adapt_patch # noqa E402 -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import get_open_port -else: - from vllm.utils.network_utils import get_open_port adapt_patch(True) adapt_patch(False) diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py index d8c0e2ee..ff4777a1 100644 --- a/tests/e2e/multicard/test_aclgraph_capture_replay.py +++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py @@ -23,13 +23,7 @@ from unittest.mock import patch import pytest import torch - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import get_open_port -else: - from vllm.utils.network_utils import get_open_port +from vllm.utils.network_utils import get_open_port MODELS = [ "Qwen/Qwen3-0.6B", diff --git a/tests/e2e/multicard/test_single_request_aclgraph.py b/tests/e2e/multicard/test_single_request_aclgraph.py index 5172f72a..9e9c1c77 100644 --- a/tests/e2e/multicard/test_single_request_aclgraph.py +++ b/tests/e2e/multicard/test_single_request_aclgraph.py @@ -19,14 +19,9 @@ from typing import Any import openai import pytest +from vllm.utils.network_utils import get_open_port from tests.e2e.conftest import RemoteOpenAIServer -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import get_open_port -else: - from vllm.utils.network_utils import get_open_port MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"] diff --git a/tests/e2e/nightly/models/test_qwen3_32b.py b/tests/e2e/nightly/models/test_qwen3_32b.py index 267d56f9..9a358e64 100644 --- a/tests/e2e/nightly/models/test_qwen3_32b.py +++ b/tests/e2e/nightly/models/test_qwen3_32b.py @@ -18,15 +18,10 @@ from typing import Any import openai import pytest +from vllm.utils.network_utils import get_open_port from tests.e2e.conftest import RemoteOpenAIServer from tools.aisbench import run_aisbench_cases -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import get_open_port -else: - from vllm.utils.network_utils import get_open_port MODELS = [ "Qwen/Qwen3-32B", diff --git a/tests/e2e/singlecard/multi-modal/test_internvl.py b/tests/e2e/singlecard/multi-modal/test_internvl.py index 7cf32e4d..ac60a75c 100644 --- a/tests/e2e/singlecard/multi-modal/test_internvl.py +++ b/tests/e2e/singlecard/multi-modal/test_internvl.py @@ -25,7 +25,6 @@ from vllm.assets.image import ImageAsset from tests.e2e.conftest import VllmRunner from tests.e2e.model_utils import check_outputs_equal -from vllm_ascend.utils import vllm_version_is MODELS = [ "OpenGVLab/InternVL2-8B", @@ -34,13 +33,6 @@ MODELS = [ "OpenGVLab/InternVL3_5-8B", ] -# skip testing InternVL3-8B and InternVL3_5-8B on 0.11.0 due to https://github.com/vllm-project/vllm-ascend/issues/3925. 
-if vllm_version_is("0.11.0"): - MODELS = [ - "OpenGVLab/InternVL2-8B", - "OpenGVLab/InternVL2_5-8B", - ] - @pytest.mark.parametrize("model", MODELS) def test_internvl_basic(model: str): diff --git a/tests/e2e/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py index 2fe4a855..cdf7527e 100644 --- a/tests/e2e/singlecard/test_camem.py +++ b/tests/e2e/singlecard/test_camem.py @@ -23,16 +23,11 @@ from unittest.mock import patch import torch from vllm import SamplingParams +from vllm.utils.mem_constants import GiB_bytes from tests.e2e.conftest import VllmRunner from tests.e2e.utils import fork_new_process_for_each_test from vllm_ascend.device_allocator.camem import CaMemAllocator -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import GiB_bytes -else: - from vllm.utils.mem_constants import GiB_bytes @fork_new_process_for_each_test diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py index ac8bff8a..98da1ad8 100644 --- a/tests/ut/core/test_scheduler.py +++ b/tests/ut/core/test_scheduler.py @@ -9,6 +9,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, from vllm.multimodal.inputs import (MultiModalFeatureSpec, MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams +from vllm.utils.hashing import sha256 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) from vllm.v1.core.sched.output import SchedulerOutput @@ -21,12 +22,6 @@ from vllm.v1.structured_output import StructuredOutputManager from tests.ut.base import TestBase from vllm_ascend.core.scheduler import AscendScheduler from vllm_ascend.core.scheduler_dynamic_batch import SchedulerDynamicBatch -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import sha256 -else: - from vllm.utils.hashing import sha256 EOS_TOKEN_ID = 50256 MODEL = "Qwen3-0.6B" @@ -181,23 +176,13 @@ class TestAscendScheduler(TestBase): ) cache_config.num_gpu_blocks = 10000 - if vllm_version_is("0.11.0"): - scheduler = AscendScheduler( - vllm_config=vllm_config, - kv_cache_config=kv_cache_config, - log_stats=True, - structured_output_manager=MagicMock( - spec=StructuredOutputManager), - ) - else: - scheduler = AscendScheduler( - vllm_config=vllm_config, - kv_cache_config=kv_cache_config, - log_stats=True, - block_size=block_size, - structured_output_manager=MagicMock( - spec=StructuredOutputManager), - ) + scheduler = AscendScheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + block_size=block_size, + structured_output_manager=MagicMock(spec=StructuredOutputManager), + ) should_advance = MagicMock() should_advance.return_value = False diff --git a/tests/ut/kv_connector/test_mooncake_connector.py b/tests/ut/kv_connector/test_mooncake_connector.py index 20ae60f0..8d21a02b 100644 --- a/tests/ut/kv_connector/test_mooncake_connector.py +++ b/tests/ut/kv_connector/test_mooncake_connector.py @@ -13,13 +13,7 @@ from unittest.mock import MagicMock, patch import msgspec import zmq from vllm.distributed.parallel_state import GroupCoordinator - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import make_zmq_path -else: - from vllm.utils.network_utils import make_zmq_path +from vllm.utils.network_utils import make_zmq_path fake_engine = types.ModuleType("mooncake.engine") fake_engine.TransferEngine = MagicMock() # type: ignore[attr-defined] diff --git a/tests/ut/kv_connector/utils.py 
b/tests/ut/kv_connector/utils.py index 389b5044..ab4af6a7 100644 --- a/tests/ut/kv_connector/utils.py +++ b/tests/ut/kv_connector/utils.py @@ -10,6 +10,7 @@ import torch from vllm import SamplingParams from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig, ModelConfig, SchedulerConfig, VllmConfig) +from vllm.utils.hashing import sha256 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) from vllm.v1.core.sched.scheduler import Scheduler @@ -19,13 +20,6 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import sha256 -else: - from vllm.utils.hashing import sha256 - EOS_TOKEN_ID = 50256 @@ -111,21 +105,14 @@ def create_scheduler( ], ) vllm_config.cache_config.num_gpu_blocks = num_blocks - if vllm_version_is("0.11.0"): - return Scheduler( - vllm_config=vllm_config, - kv_cache_config=kv_cache_config, - log_stats=True, - structured_output_manager=StructuredOutputManager(vllm_config), - ) - else: - return Scheduler( - vllm_config=vllm_config, - kv_cache_config=kv_cache_config, - log_stats=True, - block_size=block_size, - structured_output_manager=StructuredOutputManager(vllm_config), - ) + + return Scheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + block_size=block_size, + structured_output_manager=StructuredOutputManager(vllm_config), + ) _none_hash_initialized = False diff --git a/tests/ut/model_loader/netloader/test_netloader.py b/tests/ut/model_loader/netloader/test_netloader.py index 9e0fd18c..64d95efe 100644 --- a/tests/ut/model_loader/netloader/test_netloader.py +++ b/tests/ut/model_loader/netloader/test_netloader.py @@ -22,7 +22,6 @@ import torch from torch import nn from vllm_ascend.model_loader.netloader.netloader import ModelNetLoaderElastic -from vllm_ascend.utils import vllm_version_is class DummyDeviceConfig: @@ -174,11 +173,7 @@ def test_load_model_elastic_success(mock_logger, monkeypatch, tmp_path): "vllm_ascend.model_loader.netloader.netloader.process_weights_after_loading", lambda *a, **k: None) # patch get_ip - if vllm_version_is("0.11.0"): - monkeypatch.setattr("vllm.utils.get_ip", lambda: "127.0.0.1") - else: - monkeypatch.setattr("vllm.utils.network_utils.get_ip", - lambda: "127.0.0.1") + monkeypatch.setattr("vllm.utils.network_utils.get_ip", lambda: "127.0.0.1") # patch find_free_port monkeypatch.setattr( "vllm_ascend.model_loader.netloader.netloader.find_free_port", diff --git a/tests/ut/models/test_mla.py b/tests/ut/models/test_mla.py index 87fedc22..6b03b05b 100644 --- a/tests/ut/models/test_mla.py +++ b/tests/ut/models/test_mla.py @@ -9,7 +9,6 @@ from vllm.model_executor.layers.mla import MLAModules from tests.ut.base import TestBase from vllm_ascend.models.layers.mla import (AscendMultiHeadLatentAttention, IndexerWrapper) -from vllm_ascend.utils import vllm_version_is class TestIndexerWrapper(TestBase): @@ -85,68 +84,35 @@ class TestAscendMultiHeadLatentAttention(TestBase): "vllm_ascend.models.layers.mla.get_tensor_model_parallel_world_size") def test_initialization(self, mock_tp_size, mock_ascend_config, mock_get_vllm_config): - if vllm_version_is("0.11.0"): - with patch("vllm_ascend.models.layers.mla.Attention", - return_value=True): - mock_tp_size.return_value = 1 - mock_ascend_config.return_value.enable_shared_expert_dp = False - mock_vllm_config = MagicMock(spec=VllmConfig) - 
mock_vllm_config.model_config.hf_config = MagicMock( - num_hidden_layers=32, first_k_dense_replace=False) - mock_get_vllm_config.return_value = mock_vllm_config - mock_vllm_config.compilation_config = CompilationConfig() - attn = AscendMultiHeadLatentAttention( - hidden_size=self.hidden_size, - num_heads=self.num_heads, - scale=self.scale, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - mla_modules=self.mock_mla_modules, - cache_config=self.mock_cache_config, - quant_config=self.mock_quant_config, - prefix=self.prefix, - ) + with patch("vllm_ascend.models.layers.mla.MLAAttention", + return_value=True): + mock_tp_size.return_value = 2 + mock_ascend_config.return_value.enable_shared_expert_dp = True + mock_vllm_config = MagicMock(spec=VllmConfig) + mock_vllm_config.model_config.hf_config = MagicMock( + num_hidden_layers=32, first_k_dense_replace=True) + mock_get_vllm_config.return_value = mock_vllm_config + mock_vllm_config.compilation_config = CompilationConfig() - self.assertEqual(attn.hidden_size, self.hidden_size) - self.assertEqual(attn.kv_lora_rank, self.kv_lora_rank) - self.assertEqual(attn.debug_layer_idx, 0) - self.assertIsNotNone(attn.mla_attn) - self.assertIn( - self.prefix, - mock_vllm_config.compilation_config.static_forward_context) - else: - with patch("vllm_ascend.models.layers.mla.MLAAttention", - return_value=True): - mock_tp_size.return_value = 2 - mock_ascend_config.return_value.enable_shared_expert_dp = True - mock_vllm_config = MagicMock(spec=VllmConfig) - mock_vllm_config.model_config.hf_config = MagicMock( - num_hidden_layers=32, first_k_dense_replace=True) - mock_get_vllm_config.return_value = mock_vllm_config - mock_vllm_config.compilation_config = CompilationConfig() + attn = AscendMultiHeadLatentAttention( + hidden_size=self.hidden_size, + num_heads=self.num_heads, + scale=self.scale, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + mla_modules=self.mock_mla_modules, + cache_config=self.mock_cache_config, + quant_config=self.mock_quant_config, + prefix=self.prefix, + ) - attn = AscendMultiHeadLatentAttention( - hidden_size=self.hidden_size, - num_heads=self.num_heads, - scale=self.scale, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - mla_modules=self.mock_mla_modules, - cache_config=self.mock_cache_config, - quant_config=self.mock_quant_config, - prefix=self.prefix, - ) - - self.assertEqual(attn.tp_size, 2) - self.assertTrue(attn.enable_shared_expert_dp) - self.assertIsNotNone(attn.mla_attn) + self.assertEqual(attn.tp_size, 2) + self.assertTrue(attn.enable_shared_expert_dp) + self.assertIsNotNone(attn.mla_attn) @patch("vllm_ascend.models.layers.mla.torch.ops.vllm.mla_forward") @patch("vllm_ascend.models.layers.mla.get_current_vllm_config") @@ -164,41 +130,22 @@ class TestAscendMultiHeadLatentAttention(TestBase): num_hidden_layers=32, first_k_dense_replace=False) mock_get_vllm_config.return_value = mock_vllm_config mock_vllm_config.compilation_config = CompilationConfig() - - if vllm_version_is("0.11.0"): - with patch("vllm_ascend.models.layers.mla.Attention", - return_value=True): - attn = AscendMultiHeadLatentAttention( - hidden_size=self.hidden_size, - 
num_heads=self.num_heads, - scale=self.scale, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - mla_modules=self.mock_mla_modules, - cache_config=self.mock_cache_config, - quant_config=self.mock_quant_config, - prefix=self.prefix, - ) - else: - with patch("vllm_ascend.models.layers.mla.MLAAttention", - return_value=True): - attn = AscendMultiHeadLatentAttention( - hidden_size=self.hidden_size, - num_heads=self.num_heads, - scale=self.scale, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - mla_modules=self.mock_mla_modules, - cache_config=self.mock_cache_config, - quant_config=self.mock_quant_config, - prefix=self.prefix, - ) + with patch("vllm_ascend.models.layers.mla.MLAAttention", + return_value=True): + attn = AscendMultiHeadLatentAttention( + hidden_size=self.hidden_size, + num_heads=self.num_heads, + scale=self.scale, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + mla_modules=self.mock_mla_modules, + cache_config=self.mock_cache_config, + quant_config=self.mock_quant_config, + prefix=self.prefix, + ) positions = torch.tensor([0, 1, 2]) hidden_states = torch.randn(3, self.hidden_size) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index f7a6cbd1..bf3f8e0b 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -3,18 +3,13 @@ from unittest.mock import MagicMock, patch import pytest import torch -from vllm.config.compilation import CUDAGraphMode +from vllm.config.compilation import CompilationMode, CUDAGraphMode from vllm.engine.arg_utils import EngineArgs from vllm.platforms import PlatformEnum from tests.ut.base import TestBase from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.config.compilation import CompilationLevel -else: - from vllm.config.compilation import CompilationMode +from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD class TestNPUPlatform(TestBase): @@ -313,16 +308,10 @@ class TestNPUPlatform(TestBase): self.assertTrue("Compilation disabled, using eager mode by default" in cm.output[0]) - if vllm_version_is("0.11.0"): - self.assertEqual( - vllm_config.compilation_config.level, - CompilationLevel.NO_COMPILATION, - ) - else: - self.assertEqual( - vllm_config.compilation_config.mode, - CompilationMode.NONE, - ) + self.assertEqual( + vllm_config.compilation_config.mode, + CompilationMode.NONE, + ) self.assertEqual( vllm_config.compilation_config.cudagraph_mode, @@ -348,10 +337,7 @@ class TestNPUPlatform(TestBase): mock_init_recompute.return_value = MagicMock() vllm_config.scheduler_config = MagicMock() - if vllm_version_is("0.11.0"): - vllm_config.compilation_config.level = CompilationLevel.DYNAMO_ONCE - else: - vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE + vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE with self.assertLogs(logger="vllm", level="WARNING") as cm: from vllm_ascend import platform @@ -359,16 +345,11 @@ class TestNPUPlatform(TestBase): importlib.reload(platform) self.platform.check_and_update_config(vllm_config) self.assertTrue("NPU does not support" in 
cm.output[0]) - if vllm_version_is("0.11.0"): - self.assertEqual( - vllm_config.compilation_config.level, - CompilationLevel.NO_COMPILATION, - ) - else: - self.assertEqual( - vllm_config.compilation_config.mode, - CompilationMode.NONE, - ) + + self.assertEqual( + vllm_config.compilation_config.mode, + CompilationMode.NONE, + ) self.assertEqual( vllm_config.compilation_config.cudagraph_mode, CUDAGraphMode.NONE, @@ -396,16 +377,10 @@ class TestNPUPlatform(TestBase): "cudagraph_mode is not support on NPU. falling back to NONE" in cm.output[0]) - if vllm_version_is("0.11.0"): - self.assertEqual( - vllm_config.compilation_config.level, - CompilationLevel.NO_COMPILATION, - ) - else: - self.assertEqual( - vllm_config.compilation_config.mode, - CompilationMode.NONE, - ) + self.assertEqual( + vllm_config.compilation_config.mode, + CompilationMode.NONE, + ) self.assertEqual( vllm_config.compilation_config.cudagraph_mode, CUDAGraphMode.NONE, @@ -431,10 +406,7 @@ class TestNPUPlatform(TestBase): mock_init_recompute.return_value = MagicMock() vllm_config.scheduler_config = MagicMock() - if vllm_version_is("0.11.0"): - vllm_config.compilation_config.level = CompilationLevel.PIECEWISE - else: - vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE + vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE with self.assertLogs(logger="vllm", level="INFO") as cm: from vllm_ascend import platform @@ -443,16 +415,10 @@ class TestNPUPlatform(TestBase): self.platform.check_and_update_config(vllm_config) self.assertTrue("Torchair compilation enabled" in cm.output[0]) - if vllm_version_is("0.11.0"): - self.assertEqual( - vllm_config.compilation_config.level, - CompilationLevel.NO_COMPILATION, - ) - else: - self.assertEqual( - vllm_config.compilation_config.mode, - CompilationMode.NONE, - ) + self.assertEqual( + vllm_config.compilation_config.mode, + CompilationMode.NONE, + ) self.assertEqual( vllm_config.compilation_config.cudagraph_mode, CUDAGraphMode.NONE, @@ -658,12 +624,9 @@ class TestNPUPlatform(TestBase): def test_get_punica_wrapper(self): result = self.platform.get_punica_wrapper() - if vllm_version_is("0.11.0"): - self.assertEqual( - result, "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110") - else: - self.assertEqual(result, - "vllm_ascend.lora.punica_npu.PunicaWrapperNPU") + + self.assertEqual(result, + "vllm_ascend.lora.punica_npu.PunicaWrapperNPU") @patch("torch.npu.reset_peak_memory_stats") @patch("torch.npu.max_memory_allocated") @@ -742,16 +705,11 @@ class TestNPUPlatform(TestBase): self.assertTrue( "PIECEWISE compilation enabled on NPU. 
use_inductor not supported - " "using only ACL Graph mode" in cm.output[0]) - if vllm_version_is("0.11.0"): - self.assertEqual( - VllmConfig.compilation_config.level, - CompilationLevel.PIECEWISE, - ) - else: - self.assertEqual( - VllmConfig.compilation_config.mode, - CompilationMode.VLLM_COMPILE, - ) + + self.assertEqual( + VllmConfig.compilation_config.mode, + CompilationMode.VLLM_COMPILE, + ) self.assertEqual( VllmConfig.compilation_config.cudagraph_mode, CUDAGraphMode.PIECEWISE, diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py index 147e8378..d5b87130 100644 --- a/tests/ut/test_utils.py +++ b/tests/ut/test_utils.py @@ -274,46 +274,8 @@ class TestUtils(TestBase): utils.update_aclgraph_sizes(test_vllm_config) del os.environ['HCCL_OP_EXPANSION_MODE'] - if utils.vllm_version_is("0.11.0"): - self.assertEqual( - 137, - len(test_vllm_config.compilation_config.cudagraph_capture_sizes - )) - else: - self.assertEqual( - 0, - len(test_vllm_config.compilation_config.cudagraph_capture_sizes - )) - return - - test_vllm_config.speculative_config = mock.MagicMock() - test_vllm_config.speculative_config.num_speculative_tokens = 2 - test_vllm_config.speculative_config.draft_model_config = mock.MagicMock( - ) - test_vllm_config.speculative_config.draft_model_config.hf_config = mock.MagicMock( - ) - test_vllm_config.speculative_config.draft_model_config.hf_config.num_hidden_layers = 2 - os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV' - utils.update_aclgraph_sizes(test_vllm_config) - del os.environ['HCCL_OP_EXPANSION_MODE'] self.assertEqual( - 111, - len(test_vllm_config.compilation_config.cudagraph_capture_sizes)) - - # max_num_batch_sizes >= len(original_sizes) - test_compilation_config = CompilationConfig( - cudagraph_capture_sizes=[1, 2, 3]) - test_vllm_config = VllmConfig( - model_config=test_model_config, - compilation_config=test_compilation_config, - parallel_config=test_parallel_config, - ) - utils.update_aclgraph_sizes(test_vllm_config) - os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV' - utils.update_aclgraph_sizes(test_vllm_config) - del os.environ['HCCL_OP_EXPANSION_MODE'] - self.assertEqual( - 3, + 0, len(test_vllm_config.compilation_config.cudagraph_capture_sizes)) @mock.patch("vllm.model_executor.custom_op.CustomOp") diff --git a/tests/ut/torchair/test_torchair_mtp_proposer.py b/tests/ut/torchair/test_torchair_mtp_proposer.py index fdafce3f..50745226 100644 --- a/tests/ut/torchair/test_torchair_mtp_proposer.py +++ b/tests/ut/torchair/test_torchair_mtp_proposer.py @@ -7,7 +7,6 @@ from vllm.config import CacheConfig, VllmConfig from tests.ut.base import PytestBase from vllm_ascend.torchair.torchair_mtp_proposer import TorchairMtpProposer -from vllm_ascend.utils import vllm_version_is class TestTorchairMtpProposer(PytestBase): @@ -40,14 +39,8 @@ class TestTorchairMtpProposer(PytestBase): mocker.patch( "vllm_ascend.torchair.torchair_mtp_proposer.MtpProposer.__init__", return_value=None) - - if vllm_version_is("0.11.0"): - mock_set_default_dtype = mocker.patch( - 'vllm.model_executor.model_loader.utils.set_default_torch_dtype' - ) - else: - mock_set_default_dtype = mocker.patch( - 'vllm.utils.torch_utils.set_default_torch_dtype') + mock_set_default_dtype = mocker.patch( + 'vllm.utils.torch_utils.set_default_torch_dtype') mock_set_default_dtype.return_value.__enter__.return_value = None mock_model_loader = MagicMock() diff --git a/tests/ut/torchair/test_torchair_worker.py b/tests/ut/torchair/test_torchair_worker.py index 74a85179..51d139fd 100644 --- 
a/tests/ut/torchair/test_torchair_worker.py +++ b/tests/ut/torchair/test_torchair_worker.py @@ -4,10 +4,8 @@ import torch from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from tests.ut.base import TestBase -from vllm_ascend.utils import vllm_version_is -init_cache_hf_modules_path = "vllm.utils.init_cached_hf_modules" if vllm_version_is( - "0.11.0") else "vllm.utils.import_utils.init_cached_hf_modules" +init_cache_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules" class TestNPUTorchairWorker(TestBase): diff --git a/tests/ut/worker/test_input_batch.py b/tests/ut/worker/test_input_batch.py index cdff8e07..15c323b2 100644 --- a/tests/ut/worker/test_input_batch.py +++ b/tests/ut/worker/test_input_batch.py @@ -20,19 +20,14 @@ import numpy as np import pytest import torch from vllm.sampling_params import SamplingParams +from vllm.utils.torch_utils import make_tensor_with_pad from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata -from vllm_ascend.utils import vllm_version_is from vllm_ascend.worker.block_table import BlockTable, MultiGroupBlockTable from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch -if vllm_version_is("0.11.0"): - from vllm.utils import make_tensor_with_pad -else: - from vllm.utils.torch_utils import make_tensor_with_pad - VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 MAX_PROMPT_SIZE = 100 diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index 48a4242a..9aa9a095 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -6,10 +6,8 @@ import torch from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from tests.ut.base import TestBase -from vllm_ascend.utils import vllm_version_is -init_cached_hf_modules_path = "vllm.utils.init_cached_hf_modules" if vllm_version_is( - "0.11.0") else "vllm.utils.import_utils.init_cached_hf_modules" +init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules" class TestNPUWorker(TestBase): @@ -189,26 +187,15 @@ class TestNPUWorker(TestBase): # Create NPUWorker instance from vllm_ascend.worker.worker_v1 import NPUWorker - if vllm_version_is("0.11.0"): - with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE", - {"float32": torch.float32}): - worker = NPUWorker( - vllm_config=self.vllm_config_mock, - local_rank=self.local_rank, - rank=self.rank, - distributed_init_method=self.distributed_init_method, - is_driver_worker=self.is_driver_worker, - ) - else: - with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE", - {"float32": torch.float32}): - worker = NPUWorker( - vllm_config=self.vllm_config_mock, - local_rank=self.local_rank, - rank=self.rank, - distributed_init_method=self.distributed_init_method, - is_driver_worker=self.is_driver_worker, - ) + with patch("vllm.utils.torch_utils.STR_DTYPE_TO_TORCH_DTYPE", + {"float32": torch.float32}): + worker = NPUWorker( + vllm_config=self.vllm_config_mock, + local_rank=self.local_rank, + rank=self.rank, + distributed_init_method=self.distributed_init_method, + is_driver_worker=self.is_driver_worker, + ) # Verify cache_dtype is set to custom value self.assertEqual(worker.cache_dtype, torch.float32) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 8362e195..846f68db 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -31,14 +31,7 @@ from 
vllm.distributed import (get_dcp_group, get_decode_context_model_parallel_rank, get_decode_context_model_parallel_world_size) from vllm.forward_context import ForwardContext, get_forward_context - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv -else: - from vllm.utils.math_utils import cdiv - +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import AttentionSpec diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 5506b185..188e66a5 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -20,14 +20,7 @@ from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import logger from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv, round_down -else: - from vllm.utils.math_utils import cdiv, round_down - +from vllm.utils.math_utils import cdiv, round_down from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm_ascend import envs diff --git a/vllm_ascend/core/recompute_scheduler.py b/vllm_ascend/core/recompute_scheduler.py index 14a5d273..d04f8f85 100644 --- a/vllm_ascend/core/recompute_scheduler.py +++ b/vllm_ascend/core/recompute_scheduler.py @@ -55,8 +55,6 @@ from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.utils import ConstantList -from vllm_ascend.utils import vllm_version_is - class RecomputeScheduler(SchedulerInterface): """This Scheduler extends vllm's original v1 scheduler of version 0.11 @@ -587,14 +585,9 @@ class RecomputeScheduler(SchedulerInterface): self.kv_cache_config.kv_cache_groups) if self.running: any_request = self.running[0] - if vllm_version_is("0.11.0"): - num_common_prefix_blocks = ( - self.kv_cache_manager.get_num_common_prefix_blocks( - any_request, len(self.running))) - else: - num_common_prefix_blocks = ( - self.kv_cache_manager.get_num_common_prefix_blocks( - any_request.request_id)) + num_common_prefix_blocks = ( + self.kv_cache_manager.get_num_common_prefix_blocks( + any_request.request_id)) # Construct the scheduler output. 
new_reqs_data = [ diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index 5f02567f..3f8e8d55 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -22,14 +22,7 @@ from vllm.config import VllmConfig from vllm.distributed.kv_events import KVEventBatch from vllm.logger import logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv -else: - from vllm.utils.math_utils import cdiv - +from vllm.utils.math_utils import cdiv from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler @@ -39,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - class AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -71,14 +62,9 @@ class AscendScheduler(Scheduler): log_stats: bool = False, ) -> None: # Call the parent class's __init__ method - if vllm_version_is("0.11.0"): - super().__init__(vllm_config, kv_cache_config, - structured_output_manager, mm_registry, - include_finished_set, log_stats) - else: - super().__init__(vllm_config, kv_cache_config, - structured_output_manager, block_size, - mm_registry, include_finished_set, log_stats) + super().__init__(vllm_config, kv_cache_config, + structured_output_manager, block_size, mm_registry, + include_finished_set, log_stats) # Initialize common attributes self._initialize_common() @@ -462,14 +448,9 @@ class AscendScheduler(Scheduler): self.kv_cache_config.kv_cache_groups) if self.running: any_request = self.running[0] - if vllm_version_is("0.11.0"): - num_common_prefix_blocks = ( - self.kv_cache_manager.get_num_common_prefix_blocks( - any_request, len(self.running))) - else: - num_common_prefix_blocks = ( - self.kv_cache_manager.get_num_common_prefix_blocks( - any_request.request_id)) + num_common_prefix_blocks = ( + self.kv_cache_manager.get_num_common_prefix_blocks( + any_request.request_id)) # Construct the scheduler output. 
new_reqs_data = [ diff --git a/vllm_ascend/core/scheduler_dynamic_batch.py b/vllm_ascend/core/scheduler_dynamic_batch.py index 6e984a22..8f635614 100644 --- a/vllm_ascend/core/scheduler_dynamic_batch.py +++ b/vllm_ascend/core/scheduler_dynamic_batch.py @@ -33,8 +33,6 @@ from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - class BudgetRefiner: """This budget refiner can make dynamic adjustment to the token budget @@ -130,14 +128,9 @@ class SchedulerDynamicBatch(Scheduler): include_finished_set: bool = False, log_stats: bool = False, ) -> None: - if vllm_version_is("0.11.0"): - super().__init__(vllm_config, kv_cache_config, - structured_output_manager, mm_registry, - include_finished_set, log_stats) - else: - super().__init__(vllm_config, kv_cache_config, - structured_output_manager, block_size, - mm_registry, include_finished_set, log_stats) + super().__init__(vllm_config, kv_cache_config, + structured_output_manager, block_size, mm_registry, + include_finished_set, log_stats) self.running: list[Request] = [] self.budget_refiner = BudgetRefiner( default_budget=self.scheduler_config.max_num_batched_tokens, @@ -540,14 +533,9 @@ class SchedulerDynamicBatch(Scheduler): self.kv_cache_config.kv_cache_groups) if self.running: any_request = self.running[0] - if vllm_version_is("0.11.0"): - num_common_prefix_blocks = ( - self.kv_cache_manager.get_num_common_prefix_blocks( - any_request, len(self.running))) - else: - num_common_prefix_blocks = ( - self.kv_cache_manager.get_num_common_prefix_blocks( - any_request.request_id)) + num_common_prefix_blocks = ( + self.kv_cache_manager.get_num_common_prefix_blocks( + any_request.request_id)) # Construct the scheduler output. 
new_reqs_data = [ NewRequestData.from_request( diff --git a/vllm_ascend/distributed/cpu_offload_manager/metadata.py b/vllm_ascend/distributed/cpu_offload_manager/metadata.py index 7f07a624..b89659e2 100644 --- a/vllm_ascend/distributed/cpu_offload_manager/metadata.py +++ b/vllm_ascend/distributed/cpu_offload_manager/metadata.py @@ -10,17 +10,12 @@ import vllm.envs as envs import zmq from vllm.config import KVTransferConfig, VllmConfig from vllm.utils import logger +from vllm.utils.network_utils import make_zmq_socket +from vllm.utils.torch_utils import get_dtype_size from vllm.v1.kv_cache_interface import AttentionSpec from vllm_ascend.distributed.cpu_offload_manager.cpu_kv_cache_manager import \ CPUKVCacheManager -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import get_dtype_size, make_zmq_socket -else: - from vllm.utils.network_utils import make_zmq_socket - from vllm.utils.torch_utils import get_dtype_size @dataclass diff --git a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py index 3aa49131..2ef8bf83 100644 --- a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py +++ b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py @@ -33,17 +33,13 @@ from vllm.v1.request import Request, RequestStatus import vllm_ascend.envs as envs_ascend from vllm_ascend.distributed.utils import get_transfer_timeout_value from vllm_ascend.utils import (AscendSocVersion, get_ascend_soc_version, - prefill_context_parallel_enable, - vllm_version_is) + prefill_context_parallel_enable) if prefill_context_parallel_enable(): from vllm.distributed.parallel_state import \ get_prefill_context_model_parallel_rank -if vllm_version_is("0.11.0"): - from vllm.utils import get_ip -else: - from vllm.utils.network_utils import get_ip +from vllm.utils.network_utils import get_ip TORCH_DTYPE_TO_NPU_DTYPE = { torch.half: llm_datadist.DataType.DT_FLOAT16, diff --git a/vllm_ascend/distributed/mooncake/config_data.py b/vllm_ascend/distributed/mooncake/config_data.py index 4525fe16..2434b4db 100644 --- a/vllm_ascend/distributed/mooncake/config_data.py +++ b/vllm_ascend/distributed/mooncake/config_data.py @@ -10,14 +10,7 @@ import torch from vllm.distributed.kv_transfer.kv_connector.v1.base import \ KVConnectorMetadata from vllm.utils import logger - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv -else: - from vllm.utils.math_utils import cdiv - +from vllm.utils.math_utils import cdiv from vllm.v1.core.sched.output import NewRequestData DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB diff --git a/vllm_ascend/distributed/mooncake/mooncake_engine.py b/vllm_ascend/distributed/mooncake/mooncake_engine.py index ac00e22c..143d2c91 100644 --- a/vllm_ascend/distributed/mooncake/mooncake_engine.py +++ b/vllm_ascend/distributed/mooncake/mooncake_engine.py @@ -8,6 +8,7 @@ from typing import Generator, List, Optional, Union import torch from vllm.config import VllmConfig from vllm.utils import logger +from vllm.utils.torch_utils import get_kv_cache_torch_dtype from vllm_ascend.distributed.mooncake.config_data import ( ChunkedTokenDatabase, LasyerMultiBlockReqMeta, MooncakeConnectorMetadata, @@ -16,12 +17,6 @@ from vllm_ascend.distributed.mooncake.kv_transfer import ( KVCacheStoreLayerRecvingThread, KVCacheStoreLayerSendingThread, KVCacheStoreRecvingThread, KVCacheStoreSendingThread, KVTransferThread) from vllm_ascend.distributed.mooncake.mooncake_store import 
Mooncakestore -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import get_kv_cache_torch_dtype -else: - from vllm.utils.torch_utils import get_kv_cache_torch_dtype class MooncakeEngine: diff --git a/vllm_ascend/distributed/mooncake/mooncake_store.py b/vllm_ascend/distributed/mooncake/mooncake_store.py index ec885c59..01020d72 100644 --- a/vllm_ascend/distributed/mooncake/mooncake_store.py +++ b/vllm_ascend/distributed/mooncake/mooncake_store.py @@ -6,18 +6,13 @@ from mooncake.store import ReplicateConfig # type: ignore from vllm.config import ParallelConfig from vllm.distributed.parallel_state import get_tensor_model_parallel_rank from vllm.utils import logger +from vllm.utils.network_utils import get_ip from vllm_ascend.distributed.mooncake.config_data import MooncakeEngineKey from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te -from vllm_ascend.utils import vllm_version_is from .config_data import MooncakeStoreConfig -if vllm_version_is("0.11.0"): - from vllm.utils import get_ip -else: - from vllm.utils.network_utils import get_ip - METADATA_BYTES_LEN = 24 BASE_PORT = int(os.getenv("VLLM_BASE_PORT", "8790")) diff --git a/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py b/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py index f5473a3c..aad4dc6e 100644 --- a/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py +++ b/vllm_ascend/distributed/mooncake/mooncake_store_connector_v1.py @@ -10,6 +10,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.forward_context import ForwardContext from vllm.utils import logger +from vllm.utils.network_utils import make_zmq_socket from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import Request @@ -18,12 +19,6 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm_ascend.distributed.mooncake.config_data import ( LoadSpec, MooncakeConnectorMetadata, ReqMeta, RequestTracker) from vllm_ascend.distributed.mooncake.mooncake_engine import MooncakeEngine -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import make_zmq_socket -else: - from vllm.utils.network_utils import make_zmq_socket class MooncakeConnectorV1(KVConnectorBase_V1): diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py index cf3bbaa0..403b17e4 100644 --- a/vllm_ascend/distributed/mooncake_connector.py +++ b/vllm_ascend/distributed/mooncake_connector.py @@ -37,7 +37,7 @@ import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config from vllm_ascend.distributed.mooncake.transfer_engine import get_global_te from vllm_ascend.distributed.utils import get_transfer_timeout_value -from vllm_ascend.utils import prefill_context_parallel_enable, vllm_version_is +from vllm_ascend.utils import prefill_context_parallel_enable # isort: off if prefill_context_parallel_enable(): @@ -46,10 +46,7 @@ if prefill_context_parallel_enable(): ) # isort: on -if vllm_version_is("0.11.0"): - from vllm.utils import get_ip, make_zmq_path, make_zmq_socket -else: - from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket +from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket if TYPE_CHECKING: from vllm.attention.backends.abstract import 
AttentionMetadata diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py index 1c5c0a92..ccb6d344 100644 --- a/vllm_ascend/distributed/mooncake_layerwise_connector.py +++ b/vllm_ascend/distributed/mooncake_layerwise_connector.py @@ -28,6 +28,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, get_tp_group, get_world_group) from vllm.utils import logger +from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket from vllm.v1.core.sched.output import SchedulerOutput import vllm_ascend.envs as envs_ascend @@ -35,12 +36,6 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.utils import (align_memory, get_transfer_timeout_value, kv_alltoall_and_rearrange) -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import get_ip, make_zmq_path, make_zmq_socket -else: - from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py index c19ec1b0..7fe5b878 100644 --- a/vllm_ascend/kv_offload/cpu_npu.py +++ b/vllm_ascend/kv_offload/cpu_npu.py @@ -2,17 +2,11 @@ import numpy as np import torch from vllm.attention import AttentionBackend from vllm.logger import init_logger +from vllm.utils.platform_utils import is_pin_memory_available from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, TransferResult, TransferSpec) -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import is_pin_memory_available -else: - from vllm.utils.platform_utils import is_pin_memory_available - logger = init_logger(__name__) diff --git a/vllm_ascend/lora/punica_npu.py b/vllm_ascend/lora/punica_npu.py index bf86501d..1ff9de60 100644 --- a/vllm_ascend/lora/punica_npu.py +++ b/vllm_ascend/lora/punica_npu.py @@ -349,64 +349,3 @@ class PunicaWrapperNPU(PunicaWrapperBase): bgmv_expand(buffer, lora_b_stacked, y, indices, add_inputs=True) y = y.view_as(y_org) - - -class PunicaWrapperNPU0110(PunicaWrapperNPU): - # NOTE: remove me when 0.11.0 id dropped - def add_lora_linear( # type: ignore[override] - self, - y: torch.Tensor, - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], - scale: float, - output_slices: Tuple[int, ...], - *, - buffer: Optional[Tuple[torch.Tensor, ...]] = None, - **kwargs) -> None: - """ - Applicable to linear-related lora. - - Semantics: - for i in range(len(lora_a_stacked)): - y[i] += ( - x[i].unsqueeze(0) - @ lora_a_stacked[indices[i], layer_idx, :, :] - @ lora_b_stacked[indices[i], layer_idx, :, :] - * scale - ).squeeze(0)+lora_bias_stacked[i] - - Args: - y (torch.Tensor): Output tensor. Will be changed in-place. - x (torch.Tensor): Input tensor - lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. - lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. - scale (float): Scaling factor. - output_slices (Tuple[int, ...]): Every slice's size. - buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. 
- """ - - assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) - if lora_bias_stacked is not None: - assert len(lora_bias_stacked) == len(output_slices) - y = self._apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) - - if buffer is None: - r = lora_b_stacked[0].size(-1) - # We set the buffer to be float32 by default, consistent with the - # triton op - buffer = tuple( - torch.zeros( - (x.size(0), r), dtype=torch.float32, device=x.device) - for _ in range(len(output_slices))) - self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs) - self.add_expand(y, - buffer, - lora_b_stacked, - None, - output_slices, - add_inputs=True, - **kwargs) diff --git a/vllm_ascend/model_loader/netloader/netloader.py b/vllm_ascend/model_loader/netloader/netloader.py index d613d2a7..2968ee36 100644 --- a/vllm_ascend/model_loader/netloader/netloader.py +++ b/vllm_ascend/model_loader/netloader/netloader.py @@ -29,18 +29,12 @@ from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.default_loader import DefaultModelLoader from vllm.model_executor.model_loader.utils import ( initialize_model, process_weights_after_loading) - -from vllm_ascend.utils import vllm_version_is +from vllm.utils.torch_utils import set_default_torch_dtype from .interaction.elastic import ElasticServer from .load import elastic_load from .utils import find_free_port, is_valid_path_prefix -if vllm_version_is("0.11.0"): - from vllm.model_executor.model_loader.utils import set_default_torch_dtype -else: - from vllm.utils.torch_utils import set_default_torch_dtype - @register_model_loader("netloader") class ModelNetLoaderElastic(BaseModelLoader): @@ -207,10 +201,8 @@ class ModelNetLoaderElastic(BaseModelLoader): if model is not None and ( (self.listen_port and self.listen_port in range(1024, 65535)) or (self.listen_port is None)): - if vllm_version_is("0.11.0"): - from vllm.utils import get_ip - else: - from vllm.utils.network_utils import get_ip + + from vllm.utils.network_utils import get_ip driver_ip = get_ip() if driver_ip == '0.0.0.0': diff --git a/vllm_ascend/models/layers/mla.py b/vllm_ascend/models/layers/mla.py index 4ea4a27b..33049ffe 100644 --- a/vllm_ascend/models/layers/mla.py +++ b/vllm_ascend/models/layers/mla.py @@ -24,32 +24,16 @@ from typing import Optional import torch from torch import nn from vllm.attention import AttentionMetadata +from vllm.attention.layer import MLAAttention from vllm.config import CacheConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import ForwardContext, get_forward_context -from vllm.model_executor.layers.mla import MLAModules +from vllm.model_executor.layers.mla import (MLAModules, + MultiHeadLatentAttentionWrapper) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.utils.torch_utils import direct_register_custom_op from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.attention import Attention - from vllm.model_executor.layers.mla import \ - MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper - from vllm.utils import direct_register_custom_op -else: - from vllm.attention.layer import MLAAttention - from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper - from vllm.utils.torch_utils import direct_register_custom_op - -if vllm_version_is("0.11.0"): - from vllm.attention import 
Attention - from vllm.model_executor.layers.mla import \ - MultiHeadLatentAttention as MultiHeadLatentAttentionWrapper -else: - from vllm.attention.layer import MLAAttention - from vllm.model_executor.layers.mla import MultiHeadLatentAttentionWrapper class IndexerWrapper(nn.Module): @@ -81,7 +65,6 @@ class IndexerWrapper(nn.Module): return -# TODO(whx): adapt v0.11.0 and DSA class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper): def __init__( @@ -119,61 +102,30 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper): ascend_indexer = IndexerWrapper(mla_modules.indexer) else: ascend_indexer = None - - if vllm_version_is("0.11.0"): - self.mla_attn = Attention( - num_heads=num_heads, - head_size=self.kv_lora_rank + self.qk_rope_head_dim, - scale=scale, - num_kv_heads=1, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_mla=True, - indexer=ascend_indexer, - use_sparse=mla_modules.is_sparse, - # MLA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - qk_head_dim=self.qk_head_dim, - rotary_emb=mla_modules.rotary_emb, - fused_qkv_a_proj=mla_modules.fused_qkv_a_proj, - q_b_proj=mla_modules.q_b_proj, - q_a_layernorm=mla_modules.q_a_layernorm, - q_proj=mla_modules.q_proj, - kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa, - kv_a_layernorm=mla_modules.kv_a_layernorm, - kv_b_proj=mla_modules.kv_b_proj, - o_proj=mla_modules.o_proj, - ) - else: - self.mla_attn = MLAAttention( - num_heads=num_heads, - scale=scale, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - kv_b_proj=mla_modules.kv_b_proj, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_sparse=mla_modules.is_sparse, - indexer=ascend_indexer, - # extra args - rotary_emb=mla_modules.rotary_emb, - fused_qkv_a_proj=mla_modules.fused_qkv_a_proj, - q_b_proj=mla_modules.q_b_proj, - q_a_layernorm=mla_modules.q_a_layernorm, - q_proj=mla_modules.q_proj, - kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa, - kv_a_layernorm=mla_modules.kv_a_layernorm, - o_proj=mla_modules.o_proj, - ) + self.mla_attn = MLAAttention( + num_heads=num_heads, + scale=scale, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + kv_b_proj=mla_modules.kv_b_proj, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + use_sparse=mla_modules.is_sparse, + indexer=ascend_indexer, + # extra args + rotary_emb=mla_modules.rotary_emb, + fused_qkv_a_proj=mla_modules.fused_qkv_a_proj, + q_b_proj=mla_modules.q_b_proj, + q_a_layernorm=mla_modules.q_a_layernorm, + q_proj=mla_modules.q_proj, + kv_a_proj_with_mqa=mla_modules.kv_a_proj_with_mqa, + kv_a_layernorm=mla_modules.kv_a_layernorm, + o_proj=mla_modules.o_proj, + ) compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index 6f07afdc..b910708e 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -40,14 +40,11 @@ from vllm.model_executor.models.qwen2_5_vl import ( Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration, 
Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo) from vllm.model_executor.models.utils import maybe_prefix +from vllm.model_executor.models.vision import conv3d_to_linear_weight from vllm.multimodal import MULTIMODAL_REGISTRY from vllm_ascend.ascend_forward_context import set_ascend_forward_context -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz, - vllm_version_is) - -if not vllm_version_is("0.11.0"): - from vllm.model_executor.models.vision import conv3d_to_linear_weight +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -360,9 +357,8 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer): params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if not vllm_version_is("0.11.0"): - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) + if name.endswith("patch_embed.proj.weight"): + loaded_weight = conv3d_to_linear_weight(loaded_weight) for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue @@ -537,11 +533,8 @@ class AscendQwen2_5_VLForConditionalGeneration( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - if vllm_version_is("0.11.0"): + with set_ascend_forward_context(None, self.vllm_config): image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - else: - with set_ascend_forward_context(None, self.vllm_config): - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) # Split concatenated embeddings for each image item. merge_size = self.visual.spatial_merge_size @@ -558,13 +551,9 @@ class AscendQwen2_5_VLForConditionalGeneration( else: pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) - if vllm_version_is("0.11.0"): + with set_ascend_forward_context(None, self.vllm_config): video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - else: - with set_ascend_forward_context(None, self.vllm_config): - video_embeds = self.visual(pixel_values_videos, - grid_thw=grid_thw) # Split concatenated embeddings for each video item. 
merge_size = self.visual.spatial_merge_size diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py index 7b1ce44a..f24f9823 100644 --- a/vllm_ascend/models/qwen2_vl.py +++ b/vllm_ascend/models/qwen2_vl.py @@ -38,13 +38,10 @@ from vllm.model_executor.models.qwen2_vl import ( Qwen2VLForConditionalGeneration, Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo) from vllm.model_executor.models.utils import maybe_prefix +from vllm.model_executor.models.vision import conv3d_to_linear_weight from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz, - vllm_version_is) - -if not vllm_version_is("0.11.0"): - from vllm.model_executor.models.vision import conv3d_to_linear_weight +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -308,9 +305,8 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer): loaded_params: Set[str] = set() for name, loaded_weight in weights: - if not vllm_version_is("0.11.0"): - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) + if name.endswith("patch_embed.proj.weight"): + loaded_weight = conv3d_to_linear_weight(loaded_weight) for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: diff --git a/vllm_ascend/models/qwen3_next.py b/vllm_ascend/models/qwen3_next.py index 622efe23..2f3585ea 100644 --- a/vllm_ascend/models/qwen3_next.py +++ b/vllm_ascend/models/qwen3_next.py @@ -50,8 +50,6 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.transformers_utils.configs import Qwen3NextConfig from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata -from vllm_ascend.utils import vllm_version_is - from vllm.model_executor.models.qwen3_next import ( # isort: skip Qwen3NextAttention, Qwen3NextDecoderLayer, Qwen3NextForCausalLM, Qwen3NextGatedDeltaNet, Qwen3NextModel, Qwen3NextSparseMoeBlock, @@ -202,11 +200,8 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase): spec_query_start_loc = attn_metadata.spec_query_start_loc non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc spec_sequence_masks = attn_metadata.spec_sequence_masks - if vllm_version_is("0.11.0"): - spec_token_masks = attn_metadata.spec_token_masks - else: - spec_token_indx = attn_metadata.spec_token_indx - non_spec_token_indx = attn_metadata.non_spec_token_indx + spec_token_indx = attn_metadata.spec_token_indx + non_spec_token_indx = attn_metadata.non_spec_token_indx spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501 non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 self_kv_cache = self.kv_cache[forward_context.virtual_engine] @@ -221,9 +216,6 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase): # 1. 
Set up dimensions for reshapes later projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens]) - if vllm_version_is("0.11.0"): - if spec_token_masks is not None: - spec_token_masks = spec_token_masks[:num_actual_tokens] projected_states_qkvz, projected_states_ba = torch.split( projected_states, [ @@ -248,13 +240,9 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase): mixed_qkv_spec = mixed_qkv mixed_qkv_non_spec = None else: - if vllm_version_is("0.11.0"): - mixed_qkv_spec = mixed_qkv[spec_token_masks] - mixed_qkv_non_spec = mixed_qkv[~spec_token_masks] - else: - mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx) - mixed_qkv_non_spec = mixed_qkv.index_select( - 0, non_spec_token_indx) + mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx) + mixed_qkv_non_spec = mixed_qkv.index_select( + 0, non_spec_token_indx) else: mixed_qkv_spec = None mixed_qkv_non_spec = mixed_qkv @@ -322,16 +310,10 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase): g_non_spec = None beta_non_spec = None else: - if vllm_version_is("0.11.0"): - g_spec = g[:, spec_token_masks] - beta_spec = beta[:, spec_token_masks] - g_non_spec = g[:, ~spec_token_masks] - beta_non_spec = beta[:, ~spec_token_masks] - else: - g_spec = g.index_select(1, spec_token_indx) - beta_spec = beta.index_select(1, spec_token_indx) - g_non_spec = g.index_select(1, non_spec_token_indx) - beta_non_spec = beta.index_select(1, non_spec_token_indx) + g_spec = g.index_select(1, spec_token_indx) + beta_spec = beta.index_select(1, spec_token_indx) + g_non_spec = g.index_select(1, non_spec_token_indx) + beta_non_spec = beta.index_select(1, non_spec_token_indx) else: g_spec = None beta_spec = None @@ -439,14 +421,9 @@ class CustomQwen3NextGatedDeltaNet(Qwen3NextGatedDeltaNet, MambaBase): dtype=core_attn_out_non_spec.dtype, device=core_attn_out_non_spec.device, ) - if vllm_version_is("0.11.0"): - core_attn_out[:, spec_token_masks] = core_attn_out_spec - core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec - else: - core_attn_out.index_copy_(1, spec_token_indx, - core_attn_out_spec) - core_attn_out.index_copy_(1, non_spec_token_indx, - core_attn_out_non_spec) + core_attn_out.index_copy_(1, spec_token_indx, core_attn_out_spec) + core_attn_out.index_copy_(1, non_spec_token_indx, + core_attn_out_non_spec) elif spec_sequence_masks is not None: core_attn_out = core_attn_out_spec else: diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index a70a4cdf..4788c87d 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -19,7 +19,7 @@ from typing import Any, Callable, Optional import torch import torch_npu -from vllm.config import get_current_vllm_config +from vllm.config import CompilationMode, get_current_vllm_config from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group, tensor_model_parallel_all_reduce) from vllm.forward_context import get_forward_context @@ -28,6 +28,8 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map, get_compressed_expert_map) +from vllm.model_executor.layers.fused_moe.shared_fused_moe import \ + SharedFusedMoE from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import MoECommType @@ -44,17 +46,7 @@ from vllm_ascend.quantization.w8a8_dynamic import \ from vllm_ascend.utils import 
(ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p, is_enable_nz, npu_stream_switch, shared_expert_dp_enabled, - shared_experts_calculation_stream, - vllm_version_is) - -if vllm_version_is("0.11.0"): - from vllm.config import CompilationLevel - - from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE # type: ignore # isort:skip -else: - from vllm.config import CompilationMode - from vllm.model_executor.layers.fused_moe.shared_fused_moe import \ - SharedFusedMoE + shared_experts_calculation_stream) class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): @@ -73,16 +65,9 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): if ascend_config.torchair_graph_config.enabled: self.use_aclgraph = False else: - if vllm_version_is("0.11.0"): - self.use_aclgraph = ( - vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE - and not vllm_config.model_config.enforce_eager) - else: - self.use_aclgraph = ( - vllm_config.compilation_config.mode - == CompilationMode.VLLM_COMPILE - and not vllm_config.model_config.enforce_eager) + self.use_aclgraph = (vllm_config.compilation_config.mode + == CompilationMode.VLLM_COMPILE and + not vllm_config.model_config.enforce_eager) self.transpose = True @@ -209,12 +194,8 @@ class AscendFusedMoE(FusedMoE): dtype=vllm_config.model_config.dtype) # init moe. - if vllm_version_is("0.11.0"): - self.local_num_experts, self.expert_map = determine_expert_map( - self.ep_size, self.ep_rank, self.global_num_experts) - else: - self.local_num_experts, self.expert_map, _ = determine_expert_map( - self.ep_size, self.ep_rank, self.global_num_experts) + self.local_num_experts, self.expert_map, _ = determine_expert_map( + self.ep_size, self.ep_rank, self.global_num_experts) # static eplb initializing with expert_map_path if self.expert_map_path and os.path.exists( self.expert_map_path) and os.access(self.expert_map_path, diff --git a/vllm_ascend/ops/register_custom_ops.py b/vllm_ascend/ops/register_custom_ops.py index 6a3057d9..bb16bc00 100644 --- a/vllm_ascend/ops/register_custom_ops.py +++ b/vllm_ascend/ops/register_custom_ops.py @@ -7,17 +7,12 @@ from vllm.distributed import (get_dp_group, get_ep_group, tensor_model_parallel_all_reduce, tensor_model_parallel_reduce_scatter) from vllm.forward_context import get_forward_context +from vllm.utils.torch_utils import direct_register_custom_op import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_forward_context import MoECommType from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch -from vllm_ascend.utils import (npu_stream_switch, prefetch_stream, - vllm_version_is) - -if vllm_version_is("0.11.0"): - from vllm.utils import direct_register_custom_op -else: - from vllm.utils.torch_utils import direct_register_custom_op +from vllm_ascend.utils import npu_stream_switch, prefetch_stream def _maybe_all_gather_and_maybe_unpad_impl( diff --git a/vllm_ascend/patch/platform/patch_mamba_config.py b/vllm_ascend/patch/platform/patch_mamba_config.py index 1c35106e..18939b0f 100644 --- a/vllm_ascend/patch/platform/patch_mamba_config.py +++ b/vllm_ascend/patch/platform/patch_mamba_config.py @@ -3,23 +3,10 @@ import vllm.model_executor.models.config from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models.config import MambaModelConfig - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv -else: - from vllm.utils.math_utils import cdiv - +from vllm.utils.math_utils import 
cdiv +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -else: - from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE - @classmethod def verify_and_update_config(cls, vllm_config) -> None: diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py index ac821e0e..db400422 100644 --- a/vllm_ascend/patch/platform/patch_multiproc_executor.py +++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py @@ -8,21 +8,14 @@ import vllm.v1.executor.multiproc_executor from vllm import envs from vllm.config import VllmConfig from vllm.distributed.device_communicators.shm_broadcast import MessageQueue +from vllm.utils.network_utils import (get_distributed_init_method, + get_loopback_ip, get_open_port) +from vllm.utils.system_utils import get_mp_context from vllm.v1.executor.abstract import FailureCallback from vllm.v1.executor.multiproc_executor import ( MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc, set_multiprocessing_worker_envs) -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import (get_distributed_init_method, get_loopback_ip, - get_mp_context, get_open_port) -else: - from vllm.utils.network_utils import (get_distributed_init_method, - get_loopback_ip, get_open_port) - from vllm.utils.system_utils import get_mp_context - class AscendMultiprocExecutor(MultiprocExecutor): supports_pp: bool = True diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index 846c4832..43a2e800 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -28,9 +28,3 @@ import vllm_ascend.patch.worker.patch_roberta # noqa import vllm_ascend.patch.worker.patch_weight_loader # noqa import vllm_ascend.patch.worker.patch_multimodal_merge # noqa import vllm_ascend.patch.worker.patch_minicpm # noqa - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa - import vllm_ascend.patch.worker.patch_deepseek_v3_2 # noqa diff --git a/vllm_ascend/patch/worker/patch_deepseek_mtp.py b/vllm_ascend/patch/worker/patch_deepseek_mtp.py deleted file mode 100644 index c4df4d50..00000000 --- a/vllm_ascend/patch/worker/patch_deepseek_mtp.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch -import torch.nn as nn -from transformers import PretrainedConfig -from vllm.config import VllmConfig -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.models.deepseek_mtp import \ - DeepSeekMultiTokenPredictorLayer -from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer -from vllm.model_executor.models.utils import maybe_prefix - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.compilation.decorators import support_torch_compile - from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP - - -class SharedHead(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - prefix: str, - quant_config: QuantizationConfig = None, - ) -> None: - super().__init__() - self.norm = RMSNorm(config.hidden_size, 
eps=config.rms_norm_eps) - self.head = ParallelLMHead( - config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=maybe_prefix(prefix, "head"), - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - return self.norm(hidden_states) - - -def predictor_init(self, vllm_config: VllmConfig, prefix: str) -> None: - nn.Module.__init__(self) - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - - self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.eh_proj = nn.Linear(config.hidden_size * 2, - config.hidden_size, - bias=False) - # We don't need topk_indices_buffer in Ascend - topk_indices_buffer = None - self.shared_head = SharedHead(config=config, - prefix=prefix, - quant_config=quant_config) - self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix, - topk_indices_buffer) - - -def predictor_forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - previous_hidden_states: torch.Tensor, - inputs_embeds: torch.Tensor | None = None, - spec_step_index: int = 0, -) -> torch.Tensor: - assert inputs_embeds is not None - # masking inputs at position 0, as not needed by MTP - inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds) - inputs_embeds = self.enorm(inputs_embeds) - previous_hidden_states = self.hnorm(previous_hidden_states) - - hidden_states = self.eh_proj( - torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) - - hidden_states, residual = self.mtp_block(positions=positions, - hidden_states=hidden_states, - residual=None) - hidden_states = residual + hidden_states - return hidden_states - - -# Patch this only for aclgraph support, as this is not support in vLLM 0.11.0 -@support_torch_compile -class AscendDeepSeekMTP(DeepSeekMTP): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - - -DeepSeekMultiTokenPredictorLayer.__init__ = predictor_init -if vllm_version_is("0.11.0"): - DeepSeekMultiTokenPredictorLayer.forward = predictor_forward diff --git a/vllm_ascend/patch/worker/patch_deepseek_v3_2.py b/vllm_ascend/patch/worker/patch_deepseek_v3_2.py deleted file mode 100644 index cdafcb67..00000000 --- a/vllm_ascend/patch/worker/patch_deepseek_v3_2.py +++ /dev/null @@ -1,108 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from itertools import islice -from typing import Optional, Union - -import torch -import vllm.model_executor.models.deepseek_v2 -from torch import nn -from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig -from vllm.distributed import get_pp_group -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.vocab_parallel_embedding import \ - VocabParallelEmbedding -from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer -from vllm.model_executor.models.utils import ( - PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers) -from vllm.sequence import IntermediateTensors - - -@support_torch_compile -class DeepseekV2Model(nn.Module): - - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - - self.vocab_size = config.vocab_size - self.is_v32 = hasattr(config, "index_topk") - topk_indices_buffer = None - - if get_pp_group().is_first_rank: - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.embed_tokens") - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: DeepseekV2DecoderLayer(vllm_config, prefix, - topk_indices_buffer), - prefix=f"{prefix}.layers") - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - - for layer in islice(self.layers, self.start_layer, self.end_layer): - hidden_states, residual = layer(positions, hidden_states, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -vllm.model_executor.models.deepseek_v2.DeepseekV2Model = DeepseekV2Model diff --git a/vllm_ascend/patch/worker/patch_triton.py b/vllm_ascend/patch/worker/patch_triton.py index 0383da9e..cc550ccc 100644 --- a/vllm_ascend/patch/worker/patch_triton.py +++ b/vllm_ascend/patch/worker/patch_triton.py @@ -6,16 +6,11 @@ import vllm.model_executor.layers.mamba.ops.causal_conv1d from vllm_ascend.ops.casual_conv1d import (causal_conv1d_fn, causal_conv1d_update_npu) from vllm_ascend.ops.fla import LayerNormFn, torch_chunk_gated_delta_rule -from vllm_ascend.ops.sigmoid_gating import ( - fused_recurrent_gated_delta_rule_fwd_kernel, - fused_recurrent_gated_delta_rule_fwd_kernel_0_11_0) 
-from vllm_ascend.utils import vllm_version_is +from vllm_ascend.ops.sigmoid_gating import \ + fused_recurrent_gated_delta_rule_fwd_kernel vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_update = causal_conv1d_update_npu vllm.model_executor.layers.mamba.ops.causal_conv1d.causal_conv1d_fn = causal_conv1d_fn -if vllm_version_is('0.11.0'): - vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel_0_11_0 -else: - vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel +vllm.model_executor.layers.fla.ops.fused_recurrent.fused_recurrent_gated_delta_rule_fwd_kernel = fused_recurrent_gated_delta_rule_fwd_kernel vllm.model_executor.layers.fla.ops.layernorm_guard.LayerNormFn = LayerNormFn vllm.model_executor.layers.fla.ops.chunk.chunk_gated_delta_rule = torch_chunk_gated_delta_rule diff --git a/vllm_ascend/patch/worker/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_weight_loader.py index cbbace8b..e0fcde04 100644 --- a/vllm_ascend/patch/worker/patch_weight_loader.py +++ b/vllm_ascend/patch/worker/patch_weight_loader.py @@ -3,13 +3,7 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.utils import set_weight_attrs - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import GiB_bytes -else: - from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_constants import GiB_bytes logger = init_logger(__name__) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 873862bb..cab69d0e 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -34,7 +34,7 @@ from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p, is_vl_model, prefill_context_parallel_enable, update_aclgraph_sizes, update_cudagraph_capture_sizes, - update_default_aclgraph_sizes, vllm_version_is) + update_default_aclgraph_sizes) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -120,10 +120,7 @@ class NPUPlatform(Platform): # initialize ascend config from vllm additional_config ascend_config = init_ascend_config(vllm_config) - if vllm_version_is("0.11.0"): - from vllm.config import CompilationLevel - else: - from vllm.config import CompilationMode # noqa: E402 + from vllm.config import CompilationMode # noqa: E402 compilation_config = vllm_config.compilation_config model_config = vllm_config.model_config @@ -149,29 +146,17 @@ class NPUPlatform(Platform): from vllm.config.compilation import CUDAGraphMode if enforce_eager: logger.info("Compilation disabled, using eager mode by default") - if vllm_version_is("0.11.0"): - compilation_config.level = CompilationLevel.NO_COMPILATION - else: - compilation_config.mode = CompilationMode.NONE + compilation_config.mode = CompilationMode.NONE compilation_config.cudagraph_num_of_warmups = 1 - if vllm_version_is("0.11.0"): - if compilation_config.level not in [ - CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE - ]: - logger.warning( - "NPU does not support %s compilation level. Setting CUDAGraphMode to NONE", - compilation_config.level) - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - else: - if compilation_config.mode not in [ - CompilationMode.NONE, CompilationMode.VLLM_COMPILE - ]: - logger.warning( - "NPU does not support %s compilation mode. 
Setting CUDAGraphMode to NONE", - compilation_config.mode) - compilation_config.cudagraph_mode = CUDAGraphMode.NONE + if compilation_config.mode not in [ + CompilationMode.NONE, CompilationMode.VLLM_COMPILE + ]: + logger.warning( + "NPU does not support %s compilation mode. Setting CUDAGraphMode to NONE", + compilation_config.mode) + compilation_config.cudagraph_mode = CUDAGraphMode.NONE # set CUDAGraphMode to None when torchair is enabled, no mather what compilation_config.level is. if ascend_config.torchair_graph_config.enabled: @@ -211,96 +196,49 @@ class NPUPlatform(Platform): f"{vllm_config.parallel_config.tensor_parallel_size}") if len(sp_aclgraph_sizes) != len(original_sizes): compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes - if vllm_version_is("0.11.0"): - compilation_config.init_with_cudagraph_sizes( - sp_aclgraph_sizes) - else: - update_cudagraph_capture_sizes(vllm_config, - sp_aclgraph_sizes) + update_cudagraph_capture_sizes(vllm_config, sp_aclgraph_sizes) # TODO: Full graph is fully supported later, and the default value will be set to full graph. if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE: compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - if vllm_version_is("0.11.0"): - if compilation_config.cudagraph_mode == CUDAGraphMode.NONE: - compilation_config.level = CompilationLevel.NO_COMPILATION - elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE: - logger.info( - "PIECEWISE compilation enabled on NPU. use_inductor not supported - " - "using only ACL Graph mode") - assert compilation_config.level == CompilationLevel.PIECEWISE, \ - "When enabling piecewise aclgraph, please make sure compilation_config.level == CompilationLevel.PIECEWISE and compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE" - compilation_config.set_splitting_ops_for_v1() - compilation_config.use_inductor = False - compilation_config.splitting_ops.extend([ - "vllm.unified_ascend_attention_with_output", - "vllm.mla_forward" - ]) - update_aclgraph_sizes(vllm_config) - elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\ - compilation_config.cudagraph_mode == CUDAGraphMode.FULL: - logger.info( - "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - " - "using only ACL Graph mode") - compilation_config.use_inductor = False - warning_message = """\033[91m - ********************************************************************************** - * WARNING: You have enabled the *full graph* feature. - * This is an early experimental stage and may involve various unknown issues. - * A known problem is that capturing too many batch sizes can lead to OOM - * (Out of Memory) errors or inference hangs. If you encounter such issues, - * consider reducing `gpu_memory_utilization` or manually specifying a smaller - * batch size for graph capture. - * For more details, please refer to: - * https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs - **********************************************************************************\033[0m - """ - logger.warning(warning_message) - else: - logger.info( - "%s cudagraph_mode is not support on NPU. 
falling back to NONE", - compilation_config.cudagraph_mode) - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - compilation_config.level = CompilationLevel.NO_COMPILATION + if compilation_config.cudagraph_mode == CUDAGraphMode.NONE: + compilation_config.mode = CompilationMode.NONE + elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE: + logger.info( + "PIECEWISE compilation enabled on NPU. use_inductor not supported - " + "using only ACL Graph mode") + assert compilation_config.mode == CompilationMode.VLLM_COMPILE, \ + "When enabling VLLM_COMPILE aclgraph, please make sure compilation_config.mode == CompilationMode.VLLM_COMPILE and compilation_config.cudagraph_mode == CUDAGraphMode.VLLM_COMPILE" + compilation_config.set_splitting_ops_for_v1() + compilation_config.use_inductor = False + compilation_config.splitting_ops.extend(["vllm::mla_forward"]) + update_aclgraph_sizes(vllm_config) + elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\ + compilation_config.cudagraph_mode == CUDAGraphMode.FULL: + logger.info( + "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - " + "using only ACL Graph mode") + compilation_config.use_inductor = False + warning_message = """\033[91m + ********************************************************************************** + * WARNING: You have enabled the *full graph* feature. + * This is an early experimental stage and may involve various unknown issues. + * A known problem is that capturing too many batch sizes can lead to OOM + * (Out of Memory) errors or inference hangs. If you encounter such issues, + * consider reducing `gpu_memory_utilization` or manually specifying a smaller + * batch size for graph capture. + * For more details, please refer to: + * https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs + **********************************************************************************\033[0m + """ + logger.warning(warning_message) else: - if compilation_config.cudagraph_mode == CUDAGraphMode.NONE: - compilation_config.mode = CompilationMode.NONE - elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE: - logger.info( - "PIECEWISE compilation enabled on NPU. use_inductor not supported - " - "using only ACL Graph mode") - assert compilation_config.mode == CompilationMode.VLLM_COMPILE, \ - "When enabling VLLM_COMPILE aclgraph, please make sure compilation_config.mode == CompilationMode.VLLM_COMPILE and compilation_config.cudagraph_mode == CUDAGraphMode.VLLM_COMPILE" - compilation_config.set_splitting_ops_for_v1() - compilation_config.use_inductor = False - compilation_config.splitting_ops.extend(["vllm::mla_forward"]) - update_aclgraph_sizes(vllm_config) - elif compilation_config.cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY or\ - compilation_config.cudagraph_mode == CUDAGraphMode.FULL: - logger.info( - "FULL_DECODE_ONLY compilation enabled on NPU. use_inductor not supported - " - "using only ACL Graph mode") - compilation_config.use_inductor = False - warning_message = """\033[91m - ********************************************************************************** - * WARNING: You have enabled the *full graph* feature. - * This is an early experimental stage and may involve various unknown issues. - * A known problem is that capturing too many batch sizes can lead to OOM - * (Out of Memory) errors or inference hangs. 
If you encounter such issues, - * consider reducing `gpu_memory_utilization` or manually specifying a smaller - * batch size for graph capture. - * For more details, please refer to: - * https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs - **********************************************************************************\033[0m - """ - logger.warning(warning_message) - else: - logger.info( - "%s cudagraph_mode is not support on NPU. falling back to NONE", - compilation_config.cudagraph_mode) - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - compilation_config.mode = CompilationMode.NONE + logger.info( + "%s cudagraph_mode is not support on NPU. falling back to NONE", + compilation_config.cudagraph_mode) + compilation_config.cudagraph_mode = CUDAGraphMode.NONE + compilation_config.mode = CompilationMode.NONE # TODO: Remove this check when ACL Graph supports ASCEND_LAUNCH_BLOCKING=1 # Then, we will have to discuss the error handling strategy and user experience @@ -315,10 +253,7 @@ class NPUPlatform(Platform): if parallel_config and parallel_config.worker_cls == "auto": # TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm. - if vllm_version_is("0.11.0"): - os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv" - else: - parallel_config.all2all_backend = "flashinfer_all2allv" + parallel_config.all2all_backend = "flashinfer_all2allv" if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp: parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker" else: @@ -443,10 +378,7 @@ class NPUPlatform(Platform): @classmethod def get_punica_wrapper(cls) -> str: - if vllm_version_is("0.11.0"): - return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU0110" - else: - return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU" + return "vllm_ascend.lora.punica_npu.PunicaWrapperNPU" @classmethod def get_current_memory_usage(cls, diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 54dc0f43..6b7d6b08 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -19,20 +19,14 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union import torch import torch_npu -from vllm.config import get_current_vllm_config +from vllm.config import CompilationMode, get_current_vllm_config from vllm.distributed import get_ep_group from vllm.forward_context import get_forward_context from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.ops.fused_moe.experts_selector import select_experts -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, is_enable_nz, - vllm_version_is) - -if vllm_version_is("0.11.0"): - from vllm.config import CompilationLevel -else: - from vllm.config import CompilationMode +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz class AscendW8A8DynamicLinearMethod: @@ -129,18 +123,10 @@ class AscendW8A8DynamicFusedMoEMethod: vllm_config = get_current_vllm_config() ascend_config = get_ascend_config() - if vllm_version_is("0.11.0"): - self.use_aclgraph = ( - vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE - and not vllm_config.model_config.enforce_eager - and not ascend_config.torchair_graph_config.enabled) - else: - self.use_aclgraph = ( - vllm_config.compilation_config.mode - == CompilationMode.VLLM_COMPILE - and not vllm_config.model_config.enforce_eager - and not 
ascend_config.torchair_graph_config.enabled) + self.use_aclgraph = ( + vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE + and not vllm_config.model_config.enforce_eager + and not ascend_config.torchair_graph_config.enabled) self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path self.in_dtype = vllm_config.model_config.dtype diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py index 0bf8b6bf..a17f5340 100644 --- a/vllm_ascend/sample/rejection_sampler.py +++ b/vllm_ascend/sample/rejection_sampler.py @@ -6,16 +6,10 @@ import torch.nn as nn import vllm.v1.sample.rejection_sampler as rs from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import (RejectionSampler, + apply_sampling_constraints, generate_uniform_probs) from vllm.v1.spec_decode.metadata import SpecDecodeMetadata -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.v1.sample.rejection_sampler import compute_probs -else: - from vllm.v1.sample.rejection_sampler import apply_sampling_constraints - PLACEHOLDER_TOKEN_ID = -1 GREEDY_TEMPERATURE = -1 # Maximum number of speculative draft tokens allowed per request in a single @@ -89,19 +83,12 @@ class AscendRejectionSampler(RejectionSampler, nn.Module): # [num_tokens, vocab_size] # NOTE(woosuk): `target_logits` can be updated in place inside the # `compute_probs` function. - if vllm_version_is("0.11.0"): - target_probs = compute_probs( - target_logits, - metadata.cu_num_draft_tokens, - sampling_metadata, - ) - else: - target_logits = apply_sampling_constraints( - target_logits, - metadata.cu_num_draft_tokens, - sampling_metadata, - ) - target_probs = target_logits.softmax(dim=-1, dtype=torch.float32) + target_logits = apply_sampling_constraints( + target_logits, + metadata.cu_num_draft_tokens, + sampling_metadata, + ) + target_probs = target_logits.softmax(dim=-1, dtype=torch.float32) output_token_ids = rejection_sample( metadata.draft_token_ids, diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 74e29178..d3be2ea9 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -5,13 +5,15 @@ import numpy as np import torch import torch.nn as nn from vllm.attention.layer import Attention -from vllm.config import CUDAGraphMode, VllmConfig, get_layers_from_vllm_config +from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig, + get_layers_from_vllm_config) from vllm.distributed.parallel_state import get_pp_group from vllm.logger import logger from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM +from vllm.utils.platform_utils import is_pin_memory_available from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata @@ -22,14 +24,6 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionState, AscendMetadata) from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.config import CompilationLevel - from vllm.utils import 
is_pin_memory_available -else: - from vllm.config import CompilationMode - from vllm.utils.platform_utils import is_pin_memory_available PADDING_SLOT_ID = -1 @@ -52,16 +46,9 @@ class EagleProposer(Proposer): self.hidden_size = vllm_config.speculative_config.draft_model_config.get_hidden_size( ) - if vllm_version_is("0.11.0"): - self.use_cuda_graph = ( - self.vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE - and not self.vllm_config.model_config.enforce_eager) - else: - self.use_cuda_graph = ( - self.vllm_config.compilation_config.mode - == CompilationMode.VLLM_COMPILE - and not self.vllm_config.model_config.enforce_eager) + self.use_cuda_graph = (self.vllm_config.compilation_config.mode + == CompilationMode.VLLM_COMPILE and + not self.vllm_config.model_config.enforce_eager) self.cudagraph_batch_sizes = list( reversed( diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index ea2889ec..7aa9b729 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -15,14 +15,7 @@ from vllm.model_executor.model_loader.utils import \ process_weights_after_loading from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv -else: - from vllm.utils.math_utils import cdiv - +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.core.sched.output import SchedulerOutput @@ -39,31 +32,21 @@ from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper, update_mla_attn_params) from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable, - prefill_context_parallel_enable, - vllm_version_is) + prefill_context_parallel_enable) if prefill_context_parallel_enable(): from vllm.distributed import get_pcp_group -if vllm_version_is("0.11.0"): - from vllm.model_executor.model_loader.utils import set_default_torch_dtype - from vllm.utils import is_pin_memory_available -else: - from vllm.utils.platform_utils import is_pin_memory_available - from vllm.utils.torch_utils import set_default_torch_dtype +from vllm.utils.platform_utils import is_pin_memory_available +from vllm.utils.torch_utils import set_default_torch_dtype logger = init_logger(__name__) PADDING_SLOT_ID = -1 -_deepseek_mtp_path = "vllm.model_executor.models.deepseek_mtp" -_deepseek_mtp_model = "DeepSeekMTP" -if vllm_version_is("0.11.0"): - _deepseek_mtp_path = "vllm_ascend.patch.worker.patch_deepseek_mtp" - _deepseek_mtp_model = "AscendDeepSeekMTP" - _MTP_MODELS = { - "DeepseekV3ForCausalLM": (_deepseek_mtp_path, _deepseek_mtp_model), + "DeepseekV3ForCausalLM": + ("vllm.model_executor.models.deepseek_mtp", "DeepSeekMTP"), "Qwen3NextForCausalLM": ("vllm_ascend.models.qwen3_next_mtp", "CustomQwen3NextMTP") } diff --git a/vllm_ascend/torchair/models/qwen3_moe.py b/vllm_ascend/torchair/models/qwen3_moe.py index 3ea3a56f..f6adc93e 100644 --- a/vllm_ascend/torchair/models/qwen3_moe.py +++ b/vllm_ascend/torchair/models/qwen3_moe.py @@ -23,7 +23,7 @@ from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from 
vllm.config import CacheConfig, CompilationMode, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, get_tp_group) @@ -55,12 +55,6 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding, init_metadata_for_sp) from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.config import CompilationLevel -else: - from vllm.config import CompilationMode class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): @@ -299,16 +293,10 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer): layer_idx = extract_layer_index(prefix) mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers) - if vllm_version_is("0.11.0"): - self.use_aclgraph = (vllm_config is not None - and vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE and - not vllm_config.model_config.enforce_eager) - else: - self.use_aclgraph = (vllm_config is not None - and vllm_config.compilation_config.mode - == CompilationMode.VLLM_COMPILE and - not vllm_config.model_config.enforce_eager) + self.use_aclgraph = (vllm_config is not None + and vllm_config.compilation_config.mode + == CompilationMode.VLLM_COMPILE + and not vllm_config.model_config.enforce_eager) if (layer_idx not in mlp_only_layers) and ( config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0): diff --git a/vllm_ascend/torchair/models/torchair_deepseek_v2.py b/vllm_ascend/torchair/models/torchair_deepseek_v2.py index f67a0ff0..06db8132 100644 --- a/vllm_ascend/torchair/models/torchair_deepseek_v2.py +++ b/vllm_ascend/torchair/models/torchair_deepseek_v2.py @@ -32,6 +32,7 @@ import torch_npu from torch import nn from transformers import PretrainedConfig from vllm.attention import AttentionMetadata +from vllm.attention.layer import MLAAttention from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -74,12 +75,7 @@ from vllm_ascend.quantization.quant_config import AscendLinearMethod from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \ TorchairAscendW8A8DynamicLinearMethod -from vllm_ascend.utils import dispose_tensor, oproj_tp_enable, vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.attention import Attention -else: - from vllm.attention.layer import MLAAttention +from vllm_ascend.utils import dispose_tensor, oproj_tp_enable class Indexer(nn.Module): @@ -616,67 +612,31 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention): # k_c.size(1) + k_pe.size(1) == kv_cache.size(2) # i.e. 
# kv_lora_rank + qk_rope_head_dim == head_size - if vllm_version_is("0.11.0"): - self.mla_attn = Attention( - num_heads=self.num_local_heads, - head_size=self.kv_lora_rank + self.qk_rope_head_dim, - scale=self.scaling, - num_kv_heads=1, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_mla=True, - use_sparse=False, - indexer=None, - # SFA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_head_dim, - v_head_dim=self.v_head_dim, - rotary_emb=self.rotary_emb, - q_a_proj=self.q_a_proj - if self.q_lora_rank is not None else None, - q_a_layernorm=self.q_a_layernorm - if self.q_lora_rank is not None else None, - q_proj=self.q_proj - if self.q_lora_rank is None else self.q_b_proj, - kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, - kv_a_layernorm=self.kv_a_layernorm, - kv_b_proj=self.kv_b_proj, - o_proj=self.o_proj, - decoder_layer=decoder_layer, - ) - else: - self.mla_attn = MLAAttention( - num_heads=self.num_local_heads, - scale=self.scaling, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_sparse=False, - indexer=None, - # MLA Args - rotary_emb=self.rotary_emb, - q_a_proj=self.q_a_proj - if self.q_lora_rank is not None else None, - q_a_layernorm=self.q_a_layernorm - if self.q_lora_rank is not None else None, - q_proj=self.q_proj - if self.q_lora_rank is None else self.q_b_proj, - q_b_proj=self.q_b_proj - if self.q_lora_rank is not None else None, - kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, - kv_a_layernorm=self.kv_a_layernorm, - kv_b_proj=self.kv_b_proj, - o_proj=self.o_proj, - ) + self.mla_attn = MLAAttention( + num_heads=self.num_local_heads, + scale=self.scaling, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + use_sparse=False, + indexer=None, + # MLA Args + rotary_emb=self.rotary_emb, + q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None, + q_a_layernorm=self.q_a_layernorm + if self.q_lora_rank is not None else None, + q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj, + q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None, + kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, + kv_a_layernorm=self.kv_a_layernorm, + kv_b_proj=self.kv_b_proj, + o_proj=self.o_proj, + ) def forward( self, @@ -882,66 +842,30 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention): index_topk=self.index_topk, prefix=f"{prefix}.indexer", ) - - if vllm_version_is("0.11.0"): - self.sfa_attn = Attention( - num_heads=self.num_local_heads, - head_size=self.kv_lora_rank + self.qk_rope_head_dim, - scale=self.scaling, - num_kv_heads=1, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_mla=True, - use_sparse=True, - indexer=self.indexer, - # SFA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_head_dim, - v_head_dim=self.v_head_dim, - rotary_emb=self.rotary_emb, - q_a_proj=self.q_a_proj - if self.q_lora_rank is not None 
else None, - q_a_layernorm=self.q_a_layernorm - if self.q_lora_rank is not None else None, - q_proj=self.q_proj - if self.q_lora_rank is None else self.q_b_proj, - kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, - kv_a_layernorm=self.kv_a_layernorm, - kv_b_proj=self.kv_b_proj, - o_proj=self.o_proj, - decoder_layer=decoder_layer, - ) - else: - self.sfa_attn = MLAAttention( - num_heads=self.num_local_heads, - scale=self.scaling, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_sparse=True, - indexer=self.indexer, - # MLA Args - rotary_emb=self.rotary_emb, - q_a_proj=self.q_a_proj - if self.q_lora_rank is not None else None, - q_a_layernorm=self.q_a_layernorm - if self.q_lora_rank is not None else None, - q_proj=self.q_proj - if self.q_lora_rank is None else self.q_b_proj, - kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, - kv_a_layernorm=self.kv_a_layernorm, - kv_b_proj=self.kv_b_proj, - o_proj=self.o_proj, - ) + self.sfa_attn = MLAAttention( + num_heads=self.num_local_heads, + scale=self.scaling, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + use_sparse=True, + indexer=self.indexer, + # MLA Args + rotary_emb=self.rotary_emb, + q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None, + q_a_layernorm=self.q_a_layernorm + if self.q_lora_rank is not None else None, + q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj, + kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, + kv_a_layernorm=self.kv_a_layernorm, + kv_b_proj=self.kv_b_proj, + o_proj=self.o_proj, + ) def forward( self, diff --git a/vllm_ascend/torchair/ops/torchair_fused_moe.py b/vllm_ascend/torchair/ops/torchair_fused_moe.py index 415aeb62..a262e284 100644 --- a/vllm_ascend/torchair/ops/torchair_fused_moe.py +++ b/vllm_ascend/torchair/ops/torchair_fused_moe.py @@ -53,8 +53,7 @@ from vllm_ascend.torchair.utils import (get_all_reduce_merge_state, super_kernel) from vllm_ascend.utils import (AscendSocVersion, dispose_tensor, get_ascend_soc_version, is_310p, - is_hierarchical_communication_enabled, - vllm_version_is) + is_hierarchical_communication_enabled) def torchair_fused_experts_with_mc2( @@ -1069,12 +1068,8 @@ class TorchairAscendFusedMoE(FusedMoE): get_compressed_expert_map(self.expert_map)) else: # init moe. 
- if vllm_version_is("0.11.0"): - self.local_num_experts, self.expert_map = determine_expert_map( - self.ep_size, self.ep_rank, self.global_num_experts) - else: - self.local_num_experts, self.expert_map, _ = determine_expert_map( - self.ep_size, self.ep_rank, self.global_num_experts) + self.local_num_experts, self.expert_map, _ = determine_expert_map( + self.ep_size, self.ep_rank, self.global_num_experts) # dynamic eplb initializing with not expert_map_path if self.dynamic_eplb: self.log2phy = determine_default_log2phy_map( diff --git a/vllm_ascend/torchair/torchair_attention.py b/vllm_ascend/torchair/torchair_attention.py index c3836200..8edf9fb3 100644 --- a/vllm_ascend/torchair/torchair_attention.py +++ b/vllm_ascend/torchair/torchair_attention.py @@ -26,13 +26,7 @@ from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer, AttentionType) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv -else: - from vllm.utils.math_utils import cdiv +from vllm.utils.math_utils import cdiv from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend, AscendAttentionMetadataBuilder, diff --git a/vllm_ascend/torchair/torchair_mla.py b/vllm_ascend/torchair/torchair_mla.py index 8cbd6603..014c08a9 100644 --- a/vllm_ascend/torchair/torchair_mla.py +++ b/vllm_ascend/torchair/torchair_mla.py @@ -12,13 +12,7 @@ from vllm.config import VllmConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv, round_down -else: - from vllm.utils.math_utils import cdiv, round_down +from vllm.utils.math_utils import cdiv, round_down import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config diff --git a/vllm_ascend/torchair/torchair_mtp_proposer.py b/vllm_ascend/torchair/torchair_mtp_proposer.py index c26b8dd4..183e0da2 100644 --- a/vllm_ascend/torchair/torchair_mtp_proposer.py +++ b/vllm_ascend/torchair/torchair_mtp_proposer.py @@ -11,6 +11,7 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader.utils import \ process_weights_after_loading +from vllm.utils.torch_utils import set_default_torch_dtype from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata @@ -23,13 +24,7 @@ from vllm_ascend.torchair.models.torchair_deepseek_mtp import \ TorchairDeepSeekMTP from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR, TorchairCommonAttentionMetadata) -from vllm_ascend.utils import (ProfileExecuteDuration, lmhead_tp_enable, - vllm_version_is) - -if vllm_version_is("0.11.0"): - from vllm.model_executor.model_loader.utils import set_default_torch_dtype -else: - from vllm.utils.torch_utils import set_default_torch_dtype +from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable PADDING_SLOT_ID = -1 diff --git a/vllm_ascend/torchair/torchair_sfa.py b/vllm_ascend/torchair/torchair_sfa.py index cfa71209..fdaab404 100644 --- a/vllm_ascend/torchair/torchair_sfa.py +++ b/vllm_ascend/torchair/torchair_sfa.py @@ -12,13 +12,7 @@ from vllm.config import 
VllmConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv, round_down -else: - from vllm.utils.math_utils import cdiv, round_down +from vllm.utils.math_utils import cdiv, round_down import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 2235cc07..a539c5a3 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -412,33 +412,21 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool: Check whether it is vLLM default capture sizes. """ - if vllm_version_is("0.11.0"): - cuda_graph_sizes = vllm_config.scheduler_config.cuda_graph_sizes - if len(cuda_graph_sizes) == 1: - cudagraph_capture_sizes = [1, 2, 4] + [ - i for i in range(8, cuda_graph_sizes[0] + 1, 8) - ] - else: - max_cudagraph_capture_size = \ - vllm_config.compilation_config.max_cudagraph_capture_size - cudagraph_capture_sizes = [ - i for i in [1, 2, 4] if i <= max_cudagraph_capture_size - ] - if max_cudagraph_capture_size >= 8: - # Step size 8 for small batch sizes, up to 256(not included) - cudagraph_capture_sizes += list( - range(8, min(max_cudagraph_capture_size + 1, 256), 8)) - if max_cudagraph_capture_size >= 256: - # Step size 16 for larger batch sizes - cudagraph_capture_sizes += list( - range(256, max_cudagraph_capture_size + 1, 16)) - - if vllm_version_is("0.11.0"): - target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes, - reverse=True) - else: - # in newer version, vVLLM use ascending order of cudagraph_capture_sizes. - target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes) + max_cudagraph_capture_size = \ + vllm_config.compilation_config.max_cudagraph_capture_size + cudagraph_capture_sizes = [ + i for i in [1, 2, 4] if i <= max_cudagraph_capture_size + ] + if max_cudagraph_capture_size >= 8: + # Step size 8 for small batch sizes, up to 256(not included) + cudagraph_capture_sizes += list( + range(8, min(max_cudagraph_capture_size + 1, 256), 8)) + if max_cudagraph_capture_size >= 256: + # Step size 16 for larger batch sizes + cudagraph_capture_sizes += list( + range(256, max_cudagraph_capture_size + 1, 16)) + # in newer version, vLLM use ascending order of cudagraph_capture_sizes. 
+ target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes) if target_cudagraph_capture_sizes == \ vllm_config.compilation_config.cudagraph_capture_sizes: return True @@ -465,21 +453,13 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None: if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \ and vllm_config.parallel_config.tensor_parallel_size == 1 \ and vllm_config.parallel_config.data_parallel_size > 1 : - if vllm_version_is("0.11.0"): - max_capture_size = vllm_config.scheduler_config.cuda_graph_sizes[0] - else: - max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size + + max_capture_size = vllm_config.compilation_config.max_cudagraph_capture_size new_cudagraph_capture_sizes = [1, 2, 5, 10, 15, 20] + [ i for i in range(24, max_capture_size + 1, 8) ] - - if vllm_version_is("0.11.0"): - vllm_config.compilation_config.cudagraph_capture_sizes = new_cudagraph_capture_sizes - vllm_config.compilation_config.init_with_cudagraph_sizes( - new_cudagraph_capture_sizes) - else: - update_cudagraph_capture_sizes(vllm_config, - new_cudagraph_capture_sizes) + update_cudagraph_capture_sizes(vllm_config, + new_cudagraph_capture_sizes) def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: @@ -573,10 +553,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: indices[0], indices[-1] = 0, len(original_sizes) - 1 sampled_sizes = [original_sizes[i] for i in indices] - if vllm_version_is("0.11.0"): - compilation_config.init_with_cudagraph_sizes(sampled_sizes) - else: - update_cudagraph_capture_sizes(vllm_config, sampled_sizes) + update_cudagraph_capture_sizes(vllm_config, sampled_sizes) logger.info( "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes", @@ -607,10 +584,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs: enlarged_sizes = [(num_speculative_tokens + 1) * size for size in original_sizes] - if vllm_version_is("0.11.0"): - compilation_config.init_with_cudagraph_sizes(enlarged_sizes) - else: - update_cudagraph_capture_sizes(vllm_config, enlarged_sizes) + update_cudagraph_capture_sizes(vllm_config, enlarged_sizes) logger.info( "Adjusted ACL graphs: %s → %s for speculative decoding", original_sizes, enlarged_sizes) @@ -719,11 +693,8 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None): "GemmaRMSNorm": AscendGemmaRMSNorm, "FusedMoE": AscendFusedMoE, "SharedFusedMoE": AscendSharedFusedMoE, + "MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention, } - mla_to_register = "MultiHeadLatentAttention" if vllm_version_is( - "0.11.0") else "MultiHeadLatentAttentionWrapper" - if vllm_config and vllm_config.model_config and vllm_config.model_config.use_mla: - REGISTERED_ASCEND_OPS[mla_to_register] = AscendMultiHeadLatentAttention for name, op_cls in REGISTERED_ASCEND_OPS.items(): CustomOp.register_oot(_decorated_op_cls=op_cls, name=name) diff --git a/vllm_ascend/worker/block_table.py b/vllm_ascend/worker/block_table.py index 579a051a..3317a237 100644 --- a/vllm_ascend/worker/block_table.py +++ b/vllm_ascend/worker/block_table.py @@ -3,13 +3,7 @@ from typing import Optional, Union import numpy as np import torch from vllm.distributed import get_dcp_group - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv -else: - from vllm.utils.math_utils import cdiv +from vllm.utils.math_utils import cdiv from vllm_ascend.utils import 
prefill_context_parallel_enable diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c91abc54..124102f5 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -41,10 +41,11 @@ import torch.nn as nn from tqdm import tqdm # type: ignore from vllm.attention import AttentionType, get_attn_backend from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.layer import Attention +from vllm.attention.layer import Attention, MLAAttention from vllm.compilation.counter import compilation_counter from vllm.compilation.monitor import set_cudagraph_capturing_enabled -from vllm.config import CUDAGraphMode, VllmConfig, get_layers_from_vllm_config +from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig, + get_layers_from_vllm_config) from vllm.distributed import tensor_model_parallel_all_gather from vllm.distributed.kv_transfer import (get_kv_transfer_group, has_kv_transfer_group) @@ -58,8 +59,6 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model -# yapf conflicts with isort for this block -# yapf: disable from vllm.model_executor.models.interfaces import (SupportsMultiModal, supports_mrope, supports_transcription) @@ -73,29 +72,23 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import length_from_prompt_token_ids_or_embeds - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - from vllm.utils import cdiv -else: - from vllm.utils.math_utils import cdiv - +from vllm.utils.import_utils import LazyLoader from vllm.utils.jsontree import json_map_leaves +from vllm.utils.math_utils import cdiv +from vllm.utils.mem_utils import DeviceMemoryProfiler +from vllm.utils.platform_utils import is_pin_memory_available +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( AttentionCGSupport, CommonAttentionMetadata, reorder_batch_to_split_decodes_and_prefills) from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher -# yapf conflicts with isort for this block -# yapf: disable from vllm.v1.kv_cache_interface import (AttentionSpec, EncoderOnlyAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, MambaSpec, MLAAttentionSpec, UniformTypeKVCacheSpecs) -# yapf: enable from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, LogprobsTensors, ModelRunnerOutput, PoolerOutput) @@ -119,6 +112,7 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata, AscendPrefillContextParallelMetadata) +# yapf conflicts with isort for this block # yapf: disable from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper, set_graph_params, @@ -147,8 +141,7 @@ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, AscendSocVersion, ProfileExecuteDuration, enable_sp, get_ascend_soc_version, is_310p, is_enable_nz, is_moe_model, lmhead_tp_enable, - 
prefill_context_parallel_enable, - vllm_version_is) + prefill_context_parallel_enable) from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch if prefill_context_parallel_enable(): @@ -157,27 +150,6 @@ if prefill_context_parallel_enable(): get_prefill_context_model_parallel_rank, get_prefill_context_model_parallel_world_size) -if vllm_version_is("0.11.0"): - from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - get_dtype_size) -else: - from vllm.utils.mem_utils import DeviceMemoryProfiler - from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size - -# yapf: enable - -if vllm_version_is("0.11.0"): - from vllm.attention.layer import Attention - from vllm.config import CompilationLevel - from vllm.utils import LazyLoader, is_pin_memory_available - - from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention -else: - from vllm.attention.layer import MLAAttention - from vllm.config import CompilationMode - from vllm.utils.import_utils import LazyLoader - from vllm.utils.platform_utils import is_pin_memory_available - if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] from vllm.v1.core.sched.output import SchedulerOutput @@ -637,11 +609,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): diagonal=1).to(self.device) if get_pp_group().is_last_rank: self.drafter = self._get_drafter() - if vllm_version_is("0.11.0"): - self.rejection_sampler = AscendRejectionSampler() - else: - self.rejection_sampler = AscendRejectionSampler( - self.sampler) + self.rejection_sampler = AscendRejectionSampler(self.sampler) self.actual_seq_lengths_q = list( range(self.decode_token_per_req, self.max_num_tokens + 1, self.decode_token_per_req)) @@ -664,11 +632,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes, # the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512). if self.compilation_config.cudagraph_capture_sizes: - if vllm_version_is("0.11.0"): - max_num_tokens = self.compilation_config.cudagraph_capture_sizes[ - 0] - else: - max_num_tokens = self.compilation_config.max_cudagraph_capture_size + max_num_tokens = self.compilation_config.max_cudagraph_capture_size else: # NOTE: To save memory, we cap the max number of tokens to 512. max_num_tokens = min( @@ -717,10 +681,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): self.input_batch.num_accepted_tokens_cpu[i] = num_tokens def _use_aclgraph(self) -> bool: - if vllm_version_is("0.11.0"): - return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager - else: - return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.mode == CompilationMode.VLLM_COMPILE and not self.model_config.enforce_eager + return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.mode == CompilationMode.VLLM_COMPILE and not self.model_config.enforce_eager def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove finished requests from the cached states. 
@@ -914,9 +875,9 @@ class NPUModelRunner(LoRAModelRunnerMixin): if mm_input.get("use_audio_in_video") is True: use_audio_in_video = True - if vllm_version_is("0.11.0"): + if supports_mrope(self.model): req_state.mrope_positions, req_state.mrope_position_delta = \ - MRotaryEmbedding.get_input_positions_tensor( + self.model.get_mrope_input_positions( req_state.prompt_token_ids, hf_config=self.model_config.hf_config, image_grid_thw=image_grid_thw, @@ -925,18 +886,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): audio_feature_lengths=audio_feature_lengths, use_audio_in_video=use_audio_in_video, ) - else: - if supports_mrope(self.model): - req_state.mrope_positions, req_state.mrope_position_delta = \ - self.model.get_mrope_input_positions( - req_state.prompt_token_ids, - hf_config=self.model_config.hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) def _sync_metadata_across_dp( self, num_tokens: int, @@ -1108,21 +1057,13 @@ class NPUModelRunner(LoRAModelRunnerMixin): mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler( scheduler_output) encoder_outputs = [] - - if vllm_version_is("0.11.0"): - mm_inputs = group_mm_kwargs_by_modality( - mm_kwargs, - device=self.device, - pin_memory=self.pin_memory, - ) - else: - model = cast(SupportsMultiModal, self.model) - mm_inputs = group_mm_kwargs_by_modality( - mm_kwargs, - device=self.device, - pin_memory=self.pin_memory, - merge_by_field_config=model.merge_by_field_config, - ) + model = cast(SupportsMultiModal, self.model) + mm_inputs = group_mm_kwargs_by_modality( + mm_kwargs, + device=self.device, + pin_memory=self.pin_memory, + merge_by_field_config=model.merge_by_field_config, + ) for modality, num_items, mm_kwargs_group in mm_inputs: # Run the encoder. # `curr_group_outputs` is either of the following: @@ -1181,56 +1122,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): return mm_kwargs, mm_hashes_pos - def _gather_mm_embeddings_0110( - self, - scheduler_output: "SchedulerOutput", - ) -> list[torch.Tensor]: - - def _iter_mm_features(req_state: CachedRequestState): - assert req_state.mm_features is not None - for mm_feature in req_state.mm_features: - pos_info = mm_feature.mm_position - yield mm_feature.identifier, pos_info, getattr( - pos_info, "is_embed", None) - - mm_embeds: list[torch.Tensor] = [] - - for req_id in self.input_batch.req_ids: - num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ - req_id] - req_state = self.requests[req_id] - num_computed_tokens = req_state.num_computed_tokens - - for mm_hash, pos_info, is_embed in _iter_mm_features(req_state): - start_pos = pos_info.offset - num_encoder_tokens = pos_info.length - - if start_pos >= num_computed_tokens + num_scheduled_tokens: - break - if start_pos + num_encoder_tokens <= num_computed_tokens: - continue - - start_idx = max(num_computed_tokens - start_pos, 0) - end_idx = min( - num_computed_tokens - start_pos + num_scheduled_tokens, - num_encoder_tokens, - ) - assert start_idx < end_idx - - encoder_output = self.encoder_cache.get(mm_hash, None) - assert encoder_output is not None, \ - f"Encoder cache miss for {mm_hash}." 
- - if is_embed is not None: - is_embed = is_embed[start_idx:end_idx] - - mm_embeds_item = gather_mm_placeholders( - encoder_output[start_idx:end_idx], - is_embed=is_embed, - ) - mm_embeds.append(mm_embeds_item) - return mm_embeds - def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", @@ -1730,22 +1621,14 @@ class NPUModelRunner(LoRAModelRunnerMixin): # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. input_ids = self.input_ids[:total_num_scheduled_tokens] - if vllm_version_is("0.11.0"): - mm_embeds = self._gather_mm_embeddings_0110(scheduler_output) - if mm_embeds: - inputs_embeds = self.model.get_input_embeddings( - input_ids, mm_embeds) - else: - inputs_embeds = self.model.get_input_embeddings(input_ids) - else: - mm_embeds, is_mm_embed = self._gather_mm_embeddings( - scheduler_output) + mm_embeds, is_mm_embed = self._gather_mm_embeddings( + scheduler_output) - inputs_embeds = self.model.get_input_embeddings( - input_ids, - multimodal_embeddings=mm_embeds, - is_multimodal=is_mm_embed, - ) + inputs_embeds = self.model.get_input_embeddings( + input_ids, + multimodal_embeddings=mm_embeds, + is_multimodal=is_mm_embed, + ) # TODO(woosuk): Avoid the copy. Optimize. self.inputs_embeds.gpu[:total_num_scheduled_tokens].copy_( @@ -2151,9 +2034,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): # TODO: Optimize the CPU -> NPU copy. cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to( self.device, non_blocking=True) - if not vllm_version_is("0.11.0"): - cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to( - self.device, non_blocking=True) + cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to( + self.device, non_blocking=True) logits_indices = torch.from_numpy(logits_indices).to(self.device, non_blocking=True) target_logits_indices = torch.from_numpy(target_logits_indices).to( @@ -2167,25 +2049,15 @@ class NPUModelRunner(LoRAModelRunnerMixin): draft_token_ids = draft_token_ids[target_logits_indices + 1] if self.pcp_size > 1: logits_indices = logits_indices_pcp - if vllm_version_is("0.11.0"): - metadata = SpecDecodeMetadata( - draft_token_ids=draft_token_ids, - num_draft_tokens=num_draft_tokens.tolist(), - cu_num_draft_tokens=cu_num_draft_tokens, - target_logits_indices=target_logits_indices, - bonus_logits_indices=bonus_logits_indices, - logits_indices=logits_indices, - ) - else: - metadata = SpecDecodeMetadata( - draft_token_ids=draft_token_ids, - num_draft_tokens=num_draft_tokens.tolist(), - cu_num_draft_tokens=cu_num_draft_tokens, - cu_num_sampled_tokens=cu_num_sampled_tokens, - target_logits_indices=target_logits_indices, - bonus_logits_indices=bonus_logits_indices, - logits_indices=logits_indices, - ) + metadata = SpecDecodeMetadata( + draft_token_ids=draft_token_ids, + num_draft_tokens=num_draft_tokens.tolist(), + cu_num_draft_tokens=cu_num_draft_tokens, + cu_num_sampled_tokens=cu_num_sampled_tokens, + target_logits_indices=target_logits_indices, + bonus_logits_indices=bonus_logits_indices, + logits_indices=logits_indices, + ) return metadata def apply_grammar_bitmask( @@ -2222,33 +2094,16 @@ class NPUModelRunner(LoRAModelRunnerMixin): shape=(logits.shape[0], grammar_bitmask.shape[1])) cumulative_index = 0 - if vllm_version_is("0.11.0"): - seq = sorted( - scheduler_output.structured_output_request_ids.items(), - key=lambda x: x[1]) - for req_id, _ in seq: + for req_id in scheduler_output.structured_output_request_ids: + num_spec_tokens = len( + 
scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + if req_id in struct_out_req_batch_indices: logit_index = struct_out_req_batch_indices[req_id] - num_spec_tokens = len( - scheduler_output.scheduled_spec_decode_tokens.get( - req_id, [])) for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + i] = \ - grammar_bitmask[cumulative_index + i] + sorted_bitmask[logit_index + + i] = grammar_bitmask[cumulative_index + i] out_indices.append(logit_index + i) - cumulative_index += 1 + num_spec_tokens - else: - for req_id in scheduler_output.structured_output_request_ids: - num_spec_tokens = len( - scheduler_output.scheduled_spec_decode_tokens.get( - req_id, [])) - if req_id in struct_out_req_batch_indices: - logit_index = struct_out_req_batch_indices[req_id] - for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + - i] = grammar_bitmask[cumulative_index + - i] - out_indices.append(logit_index + i) - cumulative_index += 1 + num_spec_tokens + cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask # Serialization of np.ndarray is much more efficient than a tensor, @@ -2518,14 +2373,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): logits = model_output_broadcast_data["logits"] # Apply structured output bitmasks if present - if vllm_version_is("0.11.0"): - if scheduler_output.grammar_bitmask is not None: - logits = self.apply_grammar_bitmask( - scheduler_output, logits) - else: - if scheduler_output.structured_output_request_ids: - logits = self.apply_grammar_bitmask( - scheduler_output, logits) + if scheduler_output.structured_output_request_ids: + logits = self.apply_grammar_bitmask(scheduler_output, logits) with ProfileExecuteDuration().capture_async("Sample"): # Sample the next token and get logprobs if needed. @@ -3837,95 +3686,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: self.reorder_batch_threshold = reorder_batch_threshold_i - def get_kv_cache_spec_v0110(self) -> dict[str, KVCacheSpec]: - """ - Generates the KVCacheSpec by parsing the kv cache format from each - Attention module in the static forward context. - Returns: - KVCacheSpec: A dictionary mapping layer names to their KV cache - format. Layers that do not need KV cache are not included. - """ - - block_size = self.vllm_config.cache_config.block_size - use_mla = self.vllm_config.model_config.use_mla - use_sparse = self.use_sparse - kv_cache_spec: dict[str, KVCacheSpec] = {} - attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) - for layer_name, attn_module in attn_layers.items(): - if (kv_tgt_layer := - attn_module.kv_sharing_target_layer_name) is not None: - # The layer doesn't need its own KV cache and will use that of - # the target layer. We skip creating a KVCacheSpec for it, so - # that KV cache management logic will act as this layer does - # not exist, and doesn't allocate KV cache for the layer. This - # enables the memory saving of cross-layer kv sharing, allowing - # a given amount of memory to accommodate longer context lengths - # or enable more requests to be processed simultaneously. 
- self.shared_kv_cache_layers[layer_name] = kv_tgt_layer - continue - if isinstance(attn_module, AscendMultiHeadLatentAttention): - continue - - # TODO: Support other attention modules, e.g., cross-attention - # TODO(lucas): move the attention specs into the model layers like - # the attention backends - if attn_module.attn_type == AttentionType.DECODER: - if use_mla and not use_sparse: - kv_cache_spec[layer_name] = MLAAttentionSpec( - block_size=block_size, - num_kv_heads=attn_module.num_kv_heads, - head_size=attn_module.head_size, - dtype=self.kv_cache_dtype, - cache_dtype_str=self.cache_config.cache_dtype) - else: - # TODO(cmq): This is a hack way to fix deepseek kvcache when - # using DSA. Fix the spec in vLLM is a finnal way. - kv_cache_spec[layer_name] = FullAttentionSpec( - block_size=block_size, - num_kv_heads=attn_module.num_kv_heads, - head_size=attn_module.head_size, - dtype=self.kv_cache_dtype) - elif attn_module.attn_type in (AttentionType.ENCODER, - AttentionType.ENCODER_ONLY): - # encoder-only attention does not need KV cache. - continue - elif attn_module.attn_type == AttentionType.ENCODER_DECODER: - raise NotImplementedError - else: - raise ValueError( - f"Unknown attention type: {attn_module.attn_type}") - - mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase) - if len(mamba_layers) > 0: - if (self.vllm_config.speculative_config is not None - and self.vllm_config.model_config.hf_config.model_type - not in ["qwen3_next"]): - raise NotImplementedError( - "Mamba with speculative decoding is not supported yet.") - if self.vllm_config.cache_config.enable_prefix_caching: - raise NotImplementedError( - "Prefix caching is not supported for Mamba yet.") - max_model_len = self.vllm_config.model_config.max_model_len - - page_size_padded = ( - self.vllm_config.cache_config.mamba_page_size_padded) - - # Set block_size to max_model_len, so that mamba model will always - # have only one block in the KV cache. - for layer_name, mamba_module in mamba_layers.items(): - kv_cache_spec[layer_name] = MambaSpec( - shapes=mamba_module.get_state_shape(), - dtypes=mamba_module.get_state_dtype(), - block_size=max_model_len, - page_size_padded=page_size_padded, - mamba_type=mamba_module.mamba_type, - num_speculative_blocks=( - self.speculative_config.num_speculative_tokens - if self.speculative_config else 0), - ) - - return kv_cache_spec - def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each @@ -3934,9 +3694,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): KVCacheSpec: A dictionary mapping layer names to their KV cache format. Layers that do not need KV cache are not included. 
""" - if vllm_version_is("0.11.0"): - return self.get_kv_cache_spec_v0110() - block_size = self.vllm_config.cache_config.block_size use_mla = self.vllm_config.model_config.use_mla kv_cache_spec: dict[str, KVCacheSpec] = {} diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py index 846a4b29..c41433b5 100644 --- a/vllm_ascend/worker/npu_input_batch.py +++ b/vllm_ascend/worker/npu_input_batch.py @@ -30,6 +30,7 @@ from vllm.multimodal.inputs import (MultiModalFeatureSpec, from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import length_from_prompt_token_ids_or_embeds +from vllm.utils.collection_utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import (BatchUpdateBuilder, @@ -39,14 +40,8 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.utils import is_spec_decode_unsupported from vllm.v1.utils import copy_slice -from vllm_ascend.utils import vllm_version_is from vllm_ascend.worker.block_table import MultiGroupBlockTable -if vllm_version_is("0.11.0"): - from vllm.utils import swap_dict_values -else: - from vllm.utils.collection_utils import swap_dict_values - @dataclass class CachedRequestState: diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index db50bceb..c4124c51 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -35,6 +35,8 @@ from vllm.logger import logger from vllm.lora.request import LoRARequest from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, @@ -50,7 +52,7 @@ from vllm_ascend.platform import NPUPlatform from vllm_ascend.utils import (init_ascend_soc_version, is_enable_nz, prefill_context_parallel_enable, register_ascend_customop, sleep_mode_enabled, - try_register_lib, vllm_version_is) + try_register_lib) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402 @@ -65,12 +67,6 @@ torch_non_c_binding_in_graph_functions_npu[ torch._dynamo.trace_rules.torch_name_rule_map.append( torch_non_c_binding_in_graph_functions_npu) # noqa: E402 -if vllm_version_is("0.11.0"): - from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes -else: - from vllm.utils.mem_constants import GiB_bytes - from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE - class NPUWorker(WorkerBase): @@ -141,10 +137,7 @@ class NPUWorker(WorkerBase): if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing - if vllm_version_is("0.11.0"): - from vllm.utils import init_cached_hf_modules - else: - from vllm.utils.import_utils import init_cached_hf_modules + from vllm.utils.import_utils import init_cached_hf_modules init_cached_hf_modules()