From e4458b2d2bc292b82af2387ec25442e942323ee0 Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Fri, 27 Feb 2026 16:05:21 +0800 Subject: [PATCH] [Main2Main] Upgrade vLLM to 0226 (#6813) ### What this PR does / why we need it? Breaking: 1. https://github.com/vllm-project/vllm/pull/33452 2. https://github.com/vllm-project/vllm/pull/33451 3. https://github.com/vllm-project/vllm/pull/32567 4. https://github.com/vllm-project/vllm/pull/32344 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83b47f67b1dfad505606070ae4d9f83e50ad4ebd --------- Signed-off-by: MrZ20 <2609716663@qq.com> Signed-off-by: gcanlin Co-authored-by: MrZ20 <2609716663@qq.com> --- .../workflows/_e2e_nightly_multi_node.yaml | 2 +- .github/workflows/bot_pr_create.yaml | 2 +- .github/workflows/dockerfiles/Dockerfile.lint | 2 +- .github/workflows/pr_test_full.yaml | 2 +- .github/workflows/pr_test_light.yaml | 6 +-- .../workflows/schedule_codecov_refresh.yaml | 2 +- .../workflows/schedule_nightly_test_a2.yaml | 2 +- .../workflows/schedule_test_benchmarks.yaml | 2 +- Dockerfile | 2 +- Dockerfile.310p | 2 +- Dockerfile.310p.openEuler | 2 +- Dockerfile.a3 | 2 +- Dockerfile.a3.openEuler | 2 +- Dockerfile.openEuler | 2 +- docs/source/community/versioning_policy.md | 2 +- docs/source/conf.py | 2 +- tests/e2e/singlecard/compile/backend.py | 7 +--- .../compile/test_norm_quant_fusion.py | 9 +---- tests/ut/eplb/core/test_eplb_utils.py | 4 +- .../ut/quantization/test_modelslim_config.py | 5 +-- vllm_ascend/ascend_forward_context.py | 5 --- .../compilation/graph_fusion_pass_manager.py | 11 +---- .../passes/allreduce_rmsnorm_fusion_pass.py | 10 +---- .../passes/norm_quant_fusion_pass.py | 8 +--- .../passes/qknorm_rope_fusion_pass.py | 10 +---- .../cpu_offload/cpu_offload_connector.py | 6 +-- vllm_ascend/kv_offload/cpu_npu.py | 40 +++++++------------ vllm_ascend/ops/fused_moe/fused_moe.py | 15 ++++--- vllm_ascend/ops/mla.py | 22 ++++------ vllm_ascend/patch/platform/__init__.py | 5 --- .../patch/worker/patch_qwen3_next_mtp.py | 8 +--- vllm_ascend/patch/worker/patch_v2_eagle.py | 14 ++++++- vllm_ascend/platform.py | 2 +- vllm_ascend/quantization/method_adapters.py | 12 ++++++ vllm_ascend/quantization/modelslim_config.py | 7 +--- vllm_ascend/spec_decode/eagle_proposer.py | 29 ++++++-------- vllm_ascend/spec_decode/mtp_proposer.py | 20 +++++----- vllm_ascend/utils.py | 7 +--- vllm_ascend/worker/model_runner_v1.py | 6 +-- vllm_ascend/worker/worker.py | 3 ++ 40 files changed, 117 insertions(+), 184 deletions(-) diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index 44db2b46..16d48b9c 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -32,7 +32,7 @@ on: description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need vllm_version: required: false - default: "v0.15.0" + default: "v0.16.0" type: string description: vllm version to use vllm_ascend_remote_url: diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 51e52bed..bb956c2a 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd + VLLM_COMMIT=15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index c86324ae..a5aee23d 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd +ARG VLLM_COMMIT=15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 2275bfb0..925106c9 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] + vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 40499534..938430fc 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd + vllm: 15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -87,7 +87,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] + vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -99,7 +99,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] + vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index ea7f97ea..e693cf7b 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd] + vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml index 9a011af6..3e555557 100644 --- a/.github/workflows/schedule_nightly_test_a2.yaml +++ b/.github/workflows/schedule_nightly_test_a2.yaml @@ -133,7 +133,7 @@ jobs: - Qwen3-Omni-30B-A3B-Instruct uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml with: - vllm: v0.15.0 + vllm: v0.16.0 runner: ${{ matrix.test_config.os }} model_list: ${{ toJson(matrix.test_config.model_list) }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11' diff --git a/.github/workflows/schedule_test_benchmarks.yaml b/.github/workflows/schedule_test_benchmarks.yaml index 4189ff9e..48ee1d14 100644 --- a/.github/workflows/schedule_test_benchmarks.yaml +++ b/.github/workflows/schedule_test_benchmarks.yaml @@ -51,7 +51,7 @@ jobs: strategy: matrix: include: - - vllm_branch: v0.15.0 + - vllm_branch: v0.16.0 vllm_ascend_branch: main max-parallel: 1 container: diff --git a/Dockerfile b/Dockerfile index a28ac7c2..b04ae9e1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index e967d62b..ec5772a2 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index b5d71af0..92f94199 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3 b/Dockerfile.a3 index fd68662e..17bd077b 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -49,7 +49,7 @@ RUN apt-get update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 1636322f..4e5b3838 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -50,7 +50,7 @@ RUN yum update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 6ff70377..825b8a24 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -50,7 +50,7 @@ RUN yum update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 821e9a53..c04969d4 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -57,7 +57,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | 4572a06afe96d0a6d5d3efacf130c71505dd2bc9, v0.16.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/docs/source/conf.py b/docs/source/conf.py index 6c24c588..35d442f1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -77,7 +77,7 @@ myst_substitutions = { # CANN image tag "cann_image_tag": "8.5.0-910b-ubuntu22.04-py3.11", # vllm version in ci - "ci_vllm_version": "v0.15.0", + "ci_vllm_version": "v0.16.0", } # For cross-file header anchors diff --git a/tests/e2e/singlecard/compile/backend.py b/tests/e2e/singlecard/compile/backend.py index e0fde30c..2866f7f0 100644 --- a/tests/e2e/singlecard/compile/backend.py +++ b/tests/e2e/singlecard/compile/backend.py @@ -20,15 +20,10 @@ from typing import Any import torch.fx as fx from torch._inductor.decomposition import select_decomp_table +from vllm.compilation.passes.fx_utils import OpOverload from vllm.config import get_current_vllm_config from vllm_ascend.compilation.compiler_interface import compile_fx -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.fx_utils import OpOverload # type: ignore -else: - from vllm.compilation.passes.fx_utils import OpOverload class TestBackend: diff --git a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py index 00b2b123..8e12d276 100644 --- a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py +++ b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py @@ -19,6 +19,7 @@ import pytest import torch import torch.nn as nn import vllm.config +from vllm.compilation.passes.fx_utils import OpOverload from vllm.config import ModelConfig, VllmConfig from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment from vllm.utils.system_utils import update_environment_variables @@ -27,13 +28,7 @@ import vllm_ascend.ops.register_custom_ops # noqa from tests.e2e.singlecard.compile.backend import TestBackend from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass -from vllm_ascend.utils import enable_custom_op, vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.fx_utils import OpOverload # type: ignore -else: - from vllm.compilation.passes.fx_utils import OpOverload - +from vllm_ascend.utils import enable_custom_op # Cache backend to avoid duplicate pattern registration _backend_cache = None diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py index df49283b..1265ddba 100644 --- a/tests/ut/eplb/core/test_eplb_utils.py +++ b/tests/ut/eplb/core/test_eplb_utils.py @@ -22,9 +22,9 @@ class TestAscendConfig(unittest.TestCase): "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2}, } from vllm.model_executor.layers.fused_moe.config import RoutingMethodType - if vllm_version_is("0.15.0"): + if vllm_version_is("0.16.0"): moe_parallel_config = FusedMoEParallelConfig( - 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True) + 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", is_sequence_parallel=True, enable_eplb=True) moe_config = FusedMoEConfig( num_experts=8, experts_per_token=8, diff --git a/tests/ut/quantization/test_modelslim_config.py b/tests/ut/quantization/test_modelslim_config.py index d8dd1c6c..73497568 100644 --- a/tests/ut/quantization/test_modelslim_config.py +++ b/tests/ut/quantization/test_modelslim_config.py @@ -15,10 +15,7 @@ from vllm_ascend.quantization.modelslim_config import ( ) from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention # type: ignore -else: - from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.attention import Attention class TestAscendModelSlimConfig(TestBase): diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index acaf79fe..e1384653 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -19,7 +19,6 @@ from vllm_ascend.utils import ( is_drafter_moe_model, is_moe_model, speculative_enable_dispatch_gmm_combine_decode, - vllm_version_is, ) @@ -152,10 +151,6 @@ def set_ascend_forward_context( mc2_mask[:num_actual_tokens] = True mc2_mask[num_actual_tokens:] = False forward_context.mc2_mask = mc2_mask - - if is_draft_model and vllm_version_is("0.15.0"): - forward_context.remaining_moe_layers = None - try: yield finally: diff --git a/vllm_ascend/compilation/graph_fusion_pass_manager.py b/vllm_ascend/compilation/graph_fusion_pass_manager.py index 29275fec..43d23f37 100644 --- a/vllm_ascend/compilation/graph_fusion_pass_manager.py +++ b/vllm_ascend/compilation/graph_fusion_pass_manager.py @@ -17,17 +17,10 @@ # from torch import fx as fx +from vllm.compilation.passes.inductor_pass import get_pass_context +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.inductor_pass import get_pass_context # type: ignore - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.inductor_pass import get_pass_context - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass - class GraphFusionPassManager: """ diff --git a/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py b/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py index 525543e0..5fc40058 100644 --- a/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py +++ b/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py @@ -16,6 +16,8 @@ # import torch from torch._inductor.pattern_matcher import Match, PatternMatcherPass, PatternPrettyPrinter +from vllm.compilation.passes.inductor_pass import get_pass_context +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig from vllm.config.compilation import Range from vllm.distributed import get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce @@ -24,14 +26,6 @@ from vllm.logger import logger from vllm_ascend.compilation.passes.base_pattern import BasePattern from vllm_ascend.compilation.passes.utils.npugraph_ex_utils_check import extra_stream_scope_check -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.inductor_pass import get_pass_context # type: ignore - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.inductor_pass import get_pass_context - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass # computation-communication tiling block is 512 ALLREDUCE_NORM_FUSE_THRESHOLD = 512 diff --git a/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py b/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py index b1f33759..0c6fbde9 100644 --- a/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py +++ b/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py @@ -17,17 +17,13 @@ # import torch from torch._inductor.pattern_matcher import PatternMatcherPass +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig from vllm.config.compilation import Range from vllm.logger import logger from vllm_ascend.compilation.passes.base_pattern import BasePattern -from vllm_ascend.utils import enable_custom_op, vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass +from vllm_ascend.utils import enable_custom_op class AddRMSNormQuantPattern(BasePattern): diff --git a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py index f7dd2832..31b0c6f4 100644 --- a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py +++ b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py @@ -17,19 +17,13 @@ # import torch from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config.compilation import Range from vllm.logger import logger +from vllm.model_executor.layers.attention import Attention from vllm_ascend.compilation.passes.base_pattern import BasePattern -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention # type: ignore - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass - from vllm.model_executor.layers.attention import Attention class QKNormRopeFusionPattern(BasePattern): diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py index 614372da..481ff73c 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py @@ -26,7 +26,6 @@ from vllm_ascend.distributed.kv_transfer.kv_pool.cpu_offload.metadata import ( MetadataServerProc, MLAConfig, ) -from vllm_ascend.utils import vllm_version_is if TYPE_CHECKING: from vllm.forward_context import ForwardContext @@ -35,10 +34,7 @@ if TYPE_CHECKING: from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention, MLAAttention # type: ignore -else: - from vllm.model_executor.layers.attention import Attention, MLAAttention +from vllm.model_executor.layers.attention import Attention, MLAAttention @dataclass diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py index 98a4d892..6932ac53 100644 --- a/vllm_ascend/kv_offload/cpu_npu.py +++ b/vllm_ascend/kv_offload/cpu_npu.py @@ -6,8 +6,6 @@ from vllm.v1.attention.backend import AttentionBackend # type: ignore from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.worker import OffloadingHandler, TransferResult, TransferSpec -from vllm_ascend.utils import vllm_version_is - logger = init_logger(__name__) @@ -155,30 +153,22 @@ class CpuNpuOffloadingHandler(OffloadingHandler): def get_finished(self) -> list[TransferResult]: results: list[TransferResult] = [] - if vllm_version_is("v0.15.0"): - for job_id, event in self.transfer_events.items(): - if event.query(): - results.append((job_id, True)) - self.events_pool.append(event) - for job_id, _ in results: - del self.transfer_events[job_id] - else: - finished_job_ids = [] - for job_id, event in self.transfer_events.items(): - if event.query(): - results.append( - TransferResult( - job_id=job_id, - success=True, - transfer_size=None, - transfer_time=None, - transfer_type=None, - ) + finished_job_ids = [] + for job_id, event in self.transfer_events.items(): + if event.query(): + results.append( + TransferResult( + job_id=job_id, + success=True, + transfer_size=None, + transfer_time=None, + transfer_type=None, ) - finished_job_ids.append(job_id) - self.events_pool.append(event) - for job_id in finished_job_ids: - del self.transfer_events[job_id] + ) + finished_job_ids.append(job_id) + self.events_pool.append(event) + for job_id in finished_job_ids: + del self.transfer_events[job_id] return results def wait(self, job_ids: set[int]) -> None: diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index 5f9e8553..7d7b581d 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -31,7 +31,7 @@ from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm_ascend.utils import vllm_version_is -if not vllm_version_is("0.15.0"): +if not vllm_version_is("0.16.0"): from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore @@ -169,7 +169,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): return final_hidden_states -if not vllm_version_is("0.15.0"): +if not vllm_version_is("0.16.0"): # Please remove this inheritance after extending vllm, todo(wxs) class AscendMoERunner(DefaultMoERunner): """ @@ -323,10 +323,10 @@ class AscendFusedMoE(FusedMoE): setup_moe_comm_method(self.moe_config) self.quant_type = self._get_quant_type() - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): self.runner = self._init_runner() - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): def _init_runner(self): # Storing the runner in the FusedMoE is an intermediate state, eventually @@ -372,7 +372,7 @@ class AscendFusedMoE(FusedMoE): """ return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states) - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): def forward( self, @@ -519,8 +519,7 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE): ): AscendFusedMoE.__init__(self, **kwargs) - if not vllm_version_is("0.15.0"): - self._routed_input_transform = routed_input_transform + self._routed_input_transform = routed_input_transform self._shared_experts = shared_experts self.use_overlapped = use_overlapped self.shared_expert_stream = None @@ -533,7 +532,7 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE): logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") self._gate = gate - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): # Recreate the runner with the correct shared_experts parameter # The parent class created the runner before self._shared_experts was set self.runner = self._init_runner() diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index e7f1f779..09e0ee36 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -25,18 +25,13 @@ from torch import nn from vllm.config import CacheConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import ForwardContext, get_forward_context +from vllm.model_executor.layers.attention import MLAAttention from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import MLAAttention # type: ignore -else: - from vllm.model_executor.layers.attention import MLAAttention class IndexerWrapper(nn.Module): @@ -126,17 +121,16 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper): o_proj=mla_modules.o_proj, ) - if not vllm_version_is("v0.15.0"): - original_process_weights = self.mla_attn.process_weights_after_loading + original_process_weights = self.mla_attn.process_weights_after_loading - def wrapped_process_weights(act_dtype: torch.dtype): - from vllm_ascend.attention.sfa_v1 import AscendSFAImpl + def wrapped_process_weights(act_dtype: torch.dtype): + from vllm_ascend.attention.sfa_v1 import AscendSFAImpl - if not isinstance(self.mla_attn.impl, AscendSFAImpl): - original_process_weights(act_dtype) - self.mla_attn.impl.process_weights_after_loading(act_dtype) + if not isinstance(self.mla_attn.impl, AscendSFAImpl): + original_process_weights(act_dtype) + self.mla_attn.impl.process_weights_after_loading(act_dtype) - self.mla_attn.process_weights_after_loading = wrapped_process_weights + self.mla_attn.process_weights_after_loading = wrapped_process_weights compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index 0f12e27c..52d9a74b 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -19,11 +19,6 @@ import os import vllm_ascend.patch.platform.patch_distributed # noqa import vllm_ascend.patch.platform.patch_mamba_config # noqa import vllm_ascend.patch.platform.patch_sched_yield # noqa -from vllm_ascend import envs -from vllm_ascend.utils import vllm_version_is if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true": import vllm_ascend.patch.platform.patch_multiproc_executor # noqa - -if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is("0.15.0"): - import vllm_ascend.patch.platform.patch_balance_schedule # noqa diff --git a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py index e87c5f3f..1bd00e0c 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py @@ -1,14 +1,8 @@ import torch import vllm.v1.worker.utils as utils +from vllm.model_executor.layers.attention import Attention from vllm.v1.worker.utils import defaultdict, extract_layer_index -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention # type: ignore -else: - from vllm.model_executor.layers.attention import Attention - # Without this patch, it will raise an exception when initialize kv_cache. # TODO To remove the patch, we need check why the original bind_kv_cache raises an NotImplementedError. diff --git a/vllm_ascend/patch/worker/patch_v2_eagle.py b/vllm_ascend/patch/worker/patch_v2_eagle.py index 4ec002b1..3b83f937 100644 --- a/vllm_ascend/patch/worker/patch_v2_eagle.py +++ b/vllm_ascend/patch/worker/patch_v2_eagle.py @@ -21,7 +21,14 @@ import vllm from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer from vllm.v1.worker.gpu.input_batch import InputBatch from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample -from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs + +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("v0.16.0"): + from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs +else: + from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs + from vllm_ascend.worker.v2.attn_utils import build_attn_metadata @@ -168,4 +175,7 @@ def propose( return self.draft_tokens[:num_reqs] -vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose +if vllm_version_is("v0.16.0"): + vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose +else: + vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index ba209319..c0af4d00 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -468,7 +468,7 @@ class NPUPlatform(Platform): _CUSTOM_OP_REGISTERED = True @classmethod - def get_attn_backend_cls(cls, selected_backend, attn_selector_config): + def get_attn_backend_cls(cls, selected_backend, attn_selector_config, num_heads: int | None = None): key = (attn_selector_config.use_mla, attn_selector_config.use_sparse) backend_map = { diff --git a/vllm_ascend/quantization/method_adapters.py b/vllm_ascend/quantization/method_adapters.py index 82056bec..f48255b6 100644 --- a/vllm_ascend/quantization/method_adapters.py +++ b/vllm_ascend/quantization/method_adapters.py @@ -117,6 +117,18 @@ class AscendLinearMethod(LinearMethodBase): if hasattr(self.quant_method, "process_weights_after_loading"): self.quant_method.process_weights_after_loading(layer) + def get_computed_params(self) -> set[str]: + """Return parameter name patterns that are computed, not loaded. + + These parameters are computed during process_weights_after_loading + rather than loaded from checkpoint: + - weight_offset: Zero for symmetric quantization + - quant_bias: Computed from weight statistics + - deq_scale: Computed as input_scale * weight_scale + - weight_scale: May be computed or have default values for some models + """ + return {"weight_offset", "quant_bias", "deq_scale", "weight_scale"} + def apply( self, layer: torch.nn.Module, diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py index 337287ea..bc14c274 100644 --- a/vllm_ascend/quantization/modelslim_config.py +++ b/vllm_ascend/quantization/modelslim_config.py @@ -401,12 +401,7 @@ class AscendModelSlimConfig(QuantizationConfig): self.packed_modules_mapping = packed_modules_model_mapping[model_type] prefix = self.quant_prefix_mapper(model_type, prefix) - from vllm_ascend.utils import vllm_version_is - - if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention # type: ignore - else: - from vllm.model_executor.layers.attention import Attention + from vllm.model_executor.layers.attention import Attention if model_type != "kimi_k2": if prefix.startswith("language_model"): diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 15612c19..69afa3b6 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -41,7 +41,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num -from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is +from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled # Currently we will fix block size to a small one since `num_reqs` can't be too large _PREPARE_INPUTS_BLOCK_SIZE = 4 @@ -357,11 +357,10 @@ class EagleProposer(VllmEagleProposer): is_draft_model=True, draft_attn_metadatas=multi_steps_attn_metadata, ): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index before first model call - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index before first model call + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 self._runnable( num_input_tokens=num_tokens, @@ -522,11 +521,10 @@ class EagleProposer(VllmEagleProposer): is_draft_model=True, draft_attn_metadatas=multi_steps_attn_metadata, ): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index for forward pass - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index for forward pass + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 draft_token_ids = self._runnable( num_input_tokens=num_input_tokens, @@ -617,11 +615,10 @@ class EagleProposer(VllmEagleProposer): forward_context.num_accept_tokens = batch_size for draft_step in range(self.num_speculative_tokens - 1): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index for each draft step iteration - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index for each draft step iteration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 # Update the inputs. # cast to int32 is crucial when eagle model is compiled. diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 45edaa05..61b15e93 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -16,7 +16,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.compilation.acl_graph import ACLGraphWrapper from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla, update_cos_sin from vllm_ascend.spec_decode.eagle_proposer import EagleProposer -from vllm_ascend.utils import lmhead_tp_enable, vllm_version_is +from vllm_ascend.utils import lmhead_tp_enable class MtpProposer(EagleProposer): @@ -130,11 +130,10 @@ class MtpProposer(EagleProposer): is_draft_model=True, in_profile_run=is_profile, ): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index for each MTP step iteration - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index for each MTP step iteration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 previous_hidden_states, positions = self.maybe_pad_and_reduce(previous_hidden_states, positions) self.model(input_ids=input_ids, positions=positions, hidden_states=previous_hidden_states) forward_context = get_forward_context() @@ -341,11 +340,10 @@ class MtpProposer(EagleProposer): num_actual_tokens=num_tokens, is_draft_model=True, ): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index for each MTP step to match all_moe_layers registration - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index for each MTP step to match all_moe_layers registration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 with record_function_or_nullcontext("mtp_forward"): model_kwargs = {} diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index dde1699d..5a338166 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -525,12 +525,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV." ) - from vllm_ascend.utils import vllm_version_is - - if vllm_version_is("0.15.0"): - arch_name = vllm_config.model_config.architectures[0] - else: - arch_name = vllm_config.model_config.architecture + arch_name = vllm_config.model_config.architecture # If original sizes exceed maximum, sample a representative subset if max_num_batch_sizes < len(original_sizes): diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 9a5ae496..fee69da7 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -138,12 +138,8 @@ if TYPE_CHECKING: else: xgr = LazyLoader("xgr", globals(), "xgrammar") -from vllm_ascend.utils import vllm_version_is -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention, MLAAttention # type: ignore -else: - from vllm.model_executor.layers.attention import Attention, MLAAttention +from vllm.model_executor.layers.attention import Attention, MLAAttention # if true, allow tensor initialization and casting with internal format (e.g., NZ) torch.npu.config.allow_internal_format = True diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 440bd75a..1cadf05b 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -531,6 +531,9 @@ class NPUWorker(WorkerBase): def pin_lora(self, lora_id: int) -> bool: return self.model_runner.pin_lora(lora_id) + def reset_encoder_cache(self) -> None: + self.model_runner.reset_encoder_cache() + def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(num_tokens=self.model_runner.decode_token_per_req, uniform_decode=True)