upgrade vLLM to 0.12.0 tag (#4647)

Upgrade vLLM to v0.12.0 tag

- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main: 86e178f7c4

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-03 23:43:05 +08:00
parent 26e8e58cea
commit 3f4c0ea0a0
22 changed files with 97 additions and 47 deletions
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -32,7 +32,7 @@ on:
        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
      vllm_version:
        required: false
-        default: "86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24"
+        default: "v0.12.0"
        type: string
        description: vllm version to use
      vllm_ascend_remote_url:
--- a/.github/workflows/format_pr_body.yaml
+++ b/.github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:

      - name: Get vLLM version
        run: |
-          VLLM_COMMIT=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+          VLLM_COMMIT=v0.12.0
          echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

      - name: Checkout repository
--- a/.github/workflows/nightly_benchmarks.yaml
+++ b/.github/workflows/nightly_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - vllm_branch: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+          - vllm_branch: v0.12.0
            vllm_ascend_branch: main
      max-parallel: 1
    container:
--- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml
@@ -86,7 +86,7 @@ jobs:
            tests: tests/e2e/nightly/ops
    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
    with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
      runner: ${{ matrix.test_config.os }}
      tests: ${{ matrix.test_config.tests }}
      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
@@ -134,7 +134,7 @@ jobs:
              - Qwen3-Next-80B-A3B-Instruct
    uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
    with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
      runner: ${{ matrix.test_config.os }}
      model_list: ${{ toJson(matrix.test_config.model_list) }}
      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
--- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -139,7 +139,7 @@ jobs:
            tests: tests/e2e/nightly/models/test_glm4_5.py
    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
    with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
      runner: ${{ matrix.test_config.os }}
      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
      tests: ${{ matrix.test_config.tests }}
--- a/.github/workflows/vllm_ascend_test_pr_full.yaml
+++ b/.github/workflows/vllm_ascend_test_pr_full.yaml
@@ -50,7 +50,7 @@ jobs:
        with:
          filters: |
            e2e_tracker:
-              - '.github/workflows/vllm_ascend_test.yaml'
+              - '.github/workflows/vllm_ascend_test_pr_full.yaml'
              - '.github/workflows/_e2e_test.yaml'
              - 'vllm_ascend/**'
              - 'csrc/**'
@@ -69,7 +69,7 @@ jobs:
    name: e2e-full
    strategy:
      matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
    needs: [changes]
    if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
    uses: ./.github/workflows/_e2e_test.yaml
--- a/.github/workflows/vllm_ascend_test_pr_light.yaml
+++ b/.github/workflows/vllm_ascend_test_pr_light.yaml
@@ -42,7 +42,7 @@ jobs:
  lint:
    uses: ./.github/workflows/pre-commit.yml
    with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
  changes:
    runs-on: ubuntu-latest
    outputs:
@@ -84,7 +84,7 @@ jobs:
        SOC_VERSION: ascend910b1
    strategy:
      matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
        
    steps:
      - name: Free up disk space
@@ -137,7 +137,8 @@ jobs:
            --ignore tests/ut/torchair/models/test_torchair_deepseek_v2.py \
            --ignore tests/ut/models/test_qwen2_vl.py \
            --ignore tests/ut/models/test_qwen2_5_vl.py \
-            --ignore tests/ut/models/test_qwen2_5_vl_without_padding.py
+            --ignore tests/ut/models/test_qwen2_5_vl_without_padding.py \
+            --ignore tests/ut/model_loder

      - name: Upload coverage to Codecov
        # only upload coverage when commits merged
@@ -154,7 +155,7 @@ jobs:
    name: e2e-light
    strategy:
      matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
    # Note (yikun): If CI resource are limited we can split job into two chain jobs
    needs: [lint, changes]
    # only trigger e2e test after lint passed and the change is e2e related with pull request.
--- a/.github/workflows/vllm_ascend_test_report.yaml
+++ b/.github/workflows/vllm_ascend_test_report.yaml
@@ -72,7 +72,7 @@ jobs:
              - DeepSeek-V2-Lite
    uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
    with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
      runner: ${{ matrix.runner }}
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      model_list: ${{ toJson(matrix.model_list) }}
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,10 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -39,10 +39,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -36,10 +36,8 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -47,10 +47,8 @@ RUN apt-get update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -50,10 +50,8 @@ RUN yum update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -50,10 +50,8 @@ RUN yum update -y && \

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -44,7 +44,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
 | vLLM Ascend | vLLM         | Python           | Stable CANN | PyTorch/torch_npu  |
 |-------------|--------------|------------------|-------------|--------------------|
-|     main    | v0.11.2 | >= 3.10, < 3.12   | 8.3.RC1 | 2.7.1 / 2.7.1 |
+|     main    | v0.12.0 tag | >= 3.10, < 3.12   | 8.3.RC2 | 2.8.0 / 2.8.0 |

 ## Release cadence

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,7 +77,7 @@ myst_substitutions = {
    # CANN image tag
    'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
    # vllm version in ci
-    'ci_vllm_version': '86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24',
+    'ci_vllm_version': 'v0.12.0',
 }

 # For cross-file header anchors
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -24,7 +24,7 @@ class TestNPUPlatform(TestBase):
        mock_vllm_config.cache_config = MagicMock()
        mock_vllm_config.scheduler_config = MagicMock()
        mock_vllm_config.speculative_config = None
-        mock_vllm_config.compilation_config.pass_config.enable_sequence_parallelism = False
+        mock_vllm_config.compilation_config.pass_config.enable_sp = False
        mock_vllm_config.compilation_config.cudagraph_mode = None
        return mock_vllm_config

--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -23,6 +23,7 @@ if HAS_TRITON:
 # isort: off
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
+import vllm_ascend.patch.worker.patch_deepseek  # noqa
 import vllm_ascend.patch.worker.patch_roberta  # noqa
 import vllm_ascend.patch.worker.patch_weight_loader  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
--- a/vllm_ascend/patch/worker/patch_deepseek.py
+++ b/vllm_ascend/patch/worker/patch_deepseek.py
@@ -0,0 +1,60 @@
+from itertools import islice
+
+import torch
+from vllm.distributed import get_pp_group
+from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
+                                                    _get_llama_4_scaling)
+from vllm.sequence import IntermediateTensors
+
+
+def forward(
+    self,
+    input_ids,
+    positions,
+    intermediate_tensors,
+    inputs_embeds,
+):
+    if get_pp_group().is_first_rank:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_input_ids(input_ids)
+        residual = None
+    else:
+        assert intermediate_tensors is not None
+        hidden_states = intermediate_tensors["hidden_states"]
+        residual = intermediate_tensors["residual"]
+
+    # Compute llama 4 scaling once per forward pass if enabled
+    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8
+    # We'll find a better way to remove this patch.
+    try:
+        llama_4_scaling_config = getattr(self.config, "llama_4_scaling")
+    except AttributeError:
+        llama_4_scaling_config = None
+    llama_4_scaling: torch.Tensor | None
+    if llama_4_scaling_config is not None:
+        llama_4_scaling = _get_llama_4_scaling(
+            original_max_position_embeddings=llama_4_scaling_config[
+                "original_max_position_embeddings"],
+            scaling_beta=llama_4_scaling_config["beta"],
+            positions=positions,
+        )
+    else:
+        llama_4_scaling = None
+
+    for layer in islice(self.layers, self.start_layer, self.end_layer):
+        hidden_states, residual = layer(positions, hidden_states, residual,
+                                        llama_4_scaling)
+
+    if not get_pp_group().is_last_rank:
+        return IntermediateTensors({
+            "hidden_states": hidden_states,
+            "residual": residual
+        })
+
+    hidden_states, _ = self.norm(hidden_states, residual)
+    return hidden_states
+
+
+DeepseekV2Model.forward = forward
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -159,7 +159,8 @@ class NPUPlatform(Platform):
                compilation_config.splitting_ops = []

        compilation_config.cudagraph_num_of_warmups = 1
-        compilation_config.pass_config.enable_fusion = False
+        compilation_config.pass_config.fuse_norm_quant = False
+        compilation_config.pass_config.fuse_act_quant = False

        if compilation_config.mode not in [
                CompilationMode.NONE, CompilationMode.VLLM_COMPILE
@@ -194,7 +195,7 @@ class NPUPlatform(Platform):
        # to ascend ops && hardwares. We update these sizes here to improve
        # default performance.
        update_default_aclgraph_sizes(vllm_config)
-        # TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
+        # TODO delete graph size update here when compilation_config.pass_config.enable_sp
        # is supported by vllm-ascend.
        if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
                enable_sp(vllm_config):
--- a/vllm_ascend/torchair/models/qwen3_moe.py
+++ b/vllm_ascend/torchair/models/qwen3_moe.py
@@ -315,8 +315,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
                                                eps=config.rms_norm_eps)

        self.enable_sequence_parallelism = (
-            vllm_config.compilation_config.pass_config.
-            enable_sequence_parallelism if vllm_config is not None else False)
+            vllm_config.compilation_config.pass_config.enable_sp
+            if vllm_config is not None else False)

    def forward(
        self,
@@ -488,7 +488,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

-        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sequence_parallelism
+        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sp
        # Set MoE hyperparameters
        self.expert_weights: list[torch.Tensor] = []

--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -773,8 +773,7 @@ def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
            from vllm.config import get_current_vllm_config
            vllm_config = get_current_vllm_config()
        _ENABLE_SP = (
-            vllm_config.compilation_config.pass_config.
-            enable_sequence_parallelism
+            vllm_config.compilation_config.pass_config.enable_sp
            or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
            # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
            # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.