diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index 8789daaf..3f2b4c62 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -32,7 +32,7 @@ on:
       description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
     vllm_version:
       required: false
-      default: "86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24"
+      default: "v0.12.0"
       type: string
       description: vllm version to use
     vllm_ascend_remote_url:
diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml
index 2302610c..5585e759 100644
--- a/.github/workflows/format_pr_body.yaml
+++ b/.github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+          VLLM_COMMIT=v0.12.0
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
       - name: Checkout repository
diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml
index 2ea9247a..e80eaf13 100644
--- a/.github/workflows/nightly_benchmarks.yaml
+++ b/.github/workflows/nightly_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+          - vllm_branch: v0.12.0
            vllm_ascend_branch: main
       max-parallel: 1
     container:
diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml
index 7b9d1103..f098ccc0 100644
--- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml
@@ -86,7 +86,7 @@ jobs:
             tests: tests/e2e/nightly/ops
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
       runner: ${{ matrix.test_config.os }}
       tests: ${{ matrix.test_config.tests }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
@@ -134,7 +134,7 @@ jobs:
           - Qwen3-Next-80B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml
index 2daedec7..76ea2787 100644
--- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -139,7 +139,7 @@ jobs:
             tests: tests/e2e/nightly/models/test_glm4_5.py
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
    with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
       runner: ${{ matrix.test_config.os }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
       tests: ${{ matrix.test_config.tests }}
diff --git a/.github/workflows/vllm_ascend_test_pr_full.yaml b/.github/workflows/vllm_ascend_test_pr_full.yaml
index ae83eb5b..1c6993cf 100644
--- a/.github/workflows/vllm_ascend_test_pr_full.yaml
+++ b/.github/workflows/vllm_ascend_test_pr_full.yaml
@@ -50,7 +50,7 @@ jobs:
         with:
           filters: |
             e2e_tracker:
-              - '.github/workflows/vllm_ascend_test.yaml'
+              - '.github/workflows/vllm_ascend_test_pr_full.yaml'
              - '.github/workflows/_e2e_test.yaml'
              - 'vllm_ascend/**'
              - 'csrc/**'
@@ -69,7 +69,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/vllm_ascend_test_pr_light.yaml b/.github/workflows/vllm_ascend_test_pr_light.yaml
index 2b2f5136..5a84bef2 100644
--- a/.github/workflows/vllm_ascend_test_pr_light.yaml
+++ b/.github/workflows/vllm_ascend_test_pr_light.yaml
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
   changes:
     runs-on: ubuntu-latest
     outputs:
@@ -84,7 +84,7 @@ jobs:
       SOC_VERSION: ascend910b1
     strategy:
       matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
     steps:
       - name: Free up disk space
@@ -137,7 +137,8 @@ jobs:
             --ignore tests/ut/torchair/models/test_torchair_deepseek_v2.py \
             --ignore tests/ut/models/test_qwen2_vl.py \
             --ignore tests/ut/models/test_qwen2_5_vl.py \
-            --ignore tests/ut/models/test_qwen2_5_vl_without_padding.py
+            --ignore tests/ut/models/test_qwen2_5_vl_without_padding.py \
+            --ignore tests/ut/model_loder
       - name: Upload coverage to Codecov
         # only upload coverage when commits merged
@@ -154,7 +155,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/vllm_ascend_test_report.yaml b/.github/workflows/vllm_ascend_test_report.yaml
index b13726f6..5e4ec010 100644
--- a/.github/workflows/vllm_ascend_test_report.yaml
+++ b/.github/workflows/vllm_ascend_test_report.yaml
@@ -72,7 +72,7 @@ jobs:
           - DeepSeek-V2-Lite
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
       runner: ${{ matrix.runner }}
       image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
       model_list: ${{ toJson(matrix.model_list) }}
diff --git a/Dockerfile b/Dockerfile
index ddedc805..9511698a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,10 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.310p b/Dockerfile.310p
index 1d59a228..0245bb87 100644
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -39,10 +39,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler
index a38aa5c7..13e38df5 100644
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -36,10 +36,8 @@ COPY . /vllm-workspace/vllm-ascend/
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.a3 b/Dockerfile.a3
index ba51228a..d10d3374 100644
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -47,10 +47,8 @@ RUN apt-get update -y && \
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler
index dd2bad6a..a270fbf7 100644
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -50,10 +50,8 @@ RUN yum update -y && \
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index c8cddcba..746667ff 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -50,10 +50,8 @@ RUN yum update -y && \
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index dcbab2c8..38130569 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -44,7 +44,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | v0.11.2 | >= 3.10, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 |
+| main | v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
 ## Release cadence
diff --git a/docs/source/conf.py b/docs/source/conf.py
index a0c1823c..6fac9eac 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,7 +77,7 @@ myst_substitutions = {
     # CANN image tag
     'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
     # vllm version in ci
-    'ci_vllm_version': '86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24',
+    'ci_vllm_version': 'v0.12.0',
 }
 # For cross-file header anchors
diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 5dedff7f..d3937c43 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -24,7 +24,7 @@ class TestNPUPlatform(TestBase):
         mock_vllm_config.cache_config = MagicMock()
         mock_vllm_config.scheduler_config = MagicMock()
         mock_vllm_config.speculative_config = None
-        mock_vllm_config.compilation_config.pass_config.enable_sequence_parallelism = False
+        mock_vllm_config.compilation_config.pass_config.enable_sp = False
         mock_vllm_config.compilation_config.cudagraph_mode = None
         return mock_vllm_config
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index 0d1dd559..a7f9d93c 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -23,6 +23,7 @@ if HAS_TRITON:
 # isort: off
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
+import vllm_ascend.patch.worker.patch_deepseek  # noqa
 import vllm_ascend.patch.worker.patch_roberta  # noqa
 import vllm_ascend.patch.worker.patch_weight_loader  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
diff --git a/vllm_ascend/patch/worker/patch_deepseek.py b/vllm_ascend/patch/worker/patch_deepseek.py
new file mode 100644
index 00000000..0578f90b
--- /dev/null
+++ b/vllm_ascend/patch/worker/patch_deepseek.py
@@ -0,0 +1,60 @@
+from itertools import islice
+
+import torch
+from vllm.distributed import get_pp_group
+from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
+                                                    _get_llama_4_scaling)
+from vllm.sequence import IntermediateTensors
+
+
+def forward(
+    self,
+    input_ids,
+    positions,
+    intermediate_tensors,
+    inputs_embeds,
+):
+    if get_pp_group().is_first_rank:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_input_ids(input_ids)
+        residual = None
+    else:
+        assert intermediate_tensors is not None
+        hidden_states = intermediate_tensors["hidden_states"]
+        residual = intermediate_tensors["residual"]
+
+    # Compute llama 4 scaling once per forward pass if enabled
+    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
+    # We'll find a better way to remove this patch.
+    try:
+        llama_4_scaling_config = getattr(self.config, "llama_4_scaling")
+    except AttributeError:
+        llama_4_scaling_config = None
+    llama_4_scaling: torch.Tensor | None
+    if llama_4_scaling_config is not None:
+        llama_4_scaling = _get_llama_4_scaling(
+            original_max_position_embeddings=llama_4_scaling_config[
+                "original_max_position_embeddings"],
+            scaling_beta=llama_4_scaling_config["beta"],
+            positions=positions,
+        )
+    else:
+        llama_4_scaling = None
+
+    for layer in islice(self.layers, self.start_layer, self.end_layer):
+        hidden_states, residual = layer(positions, hidden_states, residual,
+                                        llama_4_scaling)
+
+    if not get_pp_group().is_last_rank:
+        return IntermediateTensors({
+            "hidden_states": hidden_states,
+            "residual": residual
+        })
+
+    hidden_states, _ = self.norm(hidden_states, residual)
+    return hidden_states
+
+
+DeepseekV2Model.forward = forward
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index f59d1ed1..950efe21 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -159,7 +159,8 @@ class NPUPlatform(Platform):
             compilation_config.splitting_ops = []
             compilation_config.cudagraph_num_of_warmups = 1
-            compilation_config.pass_config.enable_fusion = False
+            compilation_config.pass_config.fuse_norm_quant = False
+            compilation_config.pass_config.fuse_act_quant = False
         if compilation_config.mode not in [
                 CompilationMode.NONE, CompilationMode.VLLM_COMPILE
@@ -194,7 +195,7 @@ class NPUPlatform(Platform):
         # to ascend ops && hardwares. We update these sizes here to improve
         # default performance.
         update_default_aclgraph_sizes(vllm_config)
-        # TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
+        # TODO delete graph size update here when compilation_config.pass_config.enable_sp
         # is supported by vllm-ascend.
         if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
             enable_sp(vllm_config):
diff --git a/vllm_ascend/torchair/models/qwen3_moe.py b/vllm_ascend/torchair/models/qwen3_moe.py
index 10c82816..8338946f 100644
--- a/vllm_ascend/torchair/models/qwen3_moe.py
+++ b/vllm_ascend/torchair/models/qwen3_moe.py
@@ -315,8 +315,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
                                          eps=config.rms_norm_eps)
         self.enable_sequence_parallelism = (
-            vllm_config.compilation_config.pass_config.
-            enable_sequence_parallelism if vllm_config is not None else False)
+            vllm_config.compilation_config.pass_config.enable_sp
+            if vllm_config is not None else False)
     def forward(
         self,
@@ -488,7 +488,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
-        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sequence_parallelism
+        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sp
         # Set MoE hyperparameters
         self.expert_weights: list[torch.Tensor] = []
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index fafd6bb1..b4b8269b 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -773,8 +773,7 @@ def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
         from vllm.config import get_current_vllm_config
         vllm_config = get_current_vllm_config()
     _ENABLE_SP = (
-        vllm_config.compilation_config.pass_config.
-        enable_sequence_parallelism
+        vllm_config.compilation_config.pass_config.enable_sp
         or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
         # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
         # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
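
Reviewer note: the `pass_config` renames in this diff (`enable_sequence_parallelism` → `enable_sp`, `enable_fusion` → `fuse_norm_quant` / `fuse_act_quant`) follow the vLLM v0.12.0 attribute names, so any out-of-tree code still reading the old names will raise `AttributeError` once the pin moves. A minimal sketch of a version-tolerant accessor, assuming only the attribute names visible in this diff (the helper name itself is hypothetical):

```python
def pass_config_enable_sp(vllm_config) -> bool:
    """Hypothetical helper: read the sequence-parallelism flag across vLLM versions."""
    pass_config = vllm_config.compilation_config.pass_config
    # Prefer the v0.12.0-style name, fall back to the pre-rename attribute.
    return bool(
        getattr(pass_config, "enable_sp",
                getattr(pass_config, "enable_sequence_parallelism", False)))
```

Since vllm-ascend pins a single vLLM version per branch, the in-tree code can read `enable_sp` directly; a fallback like the above only matters for plugins that must span both versions.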
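
On the new `patch_deepseek.py`: the override is applied purely as an import side effect of `vllm_ascend.patch.worker` (see the `__init__.py` hunk), which rebinds `DeepseekV2Model.forward` at class level. A rough sanity check, assuming the imports succeed in your environment:

```python
import vllm_ascend.patch.worker  # noqa: F401  # applies the monkey-patches on import

from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model
from vllm_ascend.patch.worker.patch_deepseek import forward as patched_forward

# The class-level forward should now be the patched function.
assert DeepseekV2Model.forward is patched_forward
```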