From ee804ce23e515460ab454c15b6d15f8603faf713 Mon Sep 17 00:00:00 2001
From: Nengjun Ma
Date: Thu, 19 Mar 2026 17:17:36 +0800
Subject: [PATCH] Main2main upgrade vllm to 0318 commit (#7412)

### What this PR does / why we need it?
Upgrade vLLM commit to 0318. Main content: added a pre-operation that cleans up NPU memory and waits (default max 50s) for the cleanup to complete. It is applied to test cases that previously failed because NPU memory was not released in time after earlier test cases ran.
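To make the change concrete, here is a minimal sketch of what such a pre-operation decorator could look like. This is a hypothetical illustration rather than the actual helper in `tests/e2e/conftest.py`: the `torch.npu.empty_cache()`/`torch.npu.mem_get_info()` calls assume torch_npu's CUDA-like memory API, and the `timeout_s`/`poll_interval_s` parameters are invented for the sketch (only `target_free_percentage` appears in the diffs below).

```python
# Hypothetical sketch of the pre-operation; the real helper in
# tests/e2e/conftest.py may differ.
import functools
import gc
import time

import torch
import torch_npu  # noqa: F401  # registers the torch.npu namespace


def wait_until_npu_memory_free(target_free_percentage: float = 0.95,
                               timeout_s: float = 50.0,
                               poll_interval_s: float = 1.0):
    """Before the wrapped test runs, trigger a cleanup and wait (at most
    ``timeout_s`` seconds, 50s by default) until at least
    ``target_free_percentage`` of the NPU memory is free again."""

    def decorator(fn):

        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Drop leftovers from previously executed test cases.
            gc.collect()
            torch.npu.empty_cache()
            deadline = time.monotonic() + timeout_s
            while time.monotonic() < deadline:
                free_bytes, total_bytes = torch.npu.mem_get_info()
                if free_bytes / total_bytes >= target_free_percentage:
                    break
                time.sleep(poll_interval_s)
            return fn(*args, **kwargs)

        return wrapper

    return decorator
```

The tests touched below then opt in with `@wait_until_npu_memory_free(target_free_percentage=0.95)`.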
### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
NA

- vLLM version: v0.17.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4497431df654e46fb1fb5e64bf8611e762ae5d87

---------

Signed-off-by: leo-pony
---
 .github/workflows/bot_pr_create.yaml                        | 2 +-
 .github/workflows/dockerfiles/Dockerfile.lint               | 2 +-
 .github/workflows/pr_test_full.yaml                         | 2 +-
 .github/workflows/pr_test_light.yaml                        | 6 +++---
 .github/workflows/schedule_codecov_refresh.yaml             | 2 +-
 docs/source/community/versioning_policy.md                  | 2 +-
 tests/e2e/multicard/2-cards/test_data_parallel.py           | 3 +++
 .../multicard/2-cards/test_offline_inference_distributed.py | 4 +++-
 8 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index bbaba802..31d9bbe8 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
+          VLLM_COMMIT=8b6325758cce5f9c36d38f2462edbd368b97a07c
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository
diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 277c874e..f92e3f3c 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
+ARG VLLM_COMMIT=8b6325758cce5f9c36d38f2462edbd368b97a07c
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 76b5cbc8..ca15a121 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index e607b0c2..0c693fdd 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 8a680463fab3bc9e6760417cd5c0a6aa58283065
+      vllm: 8b6325758cce5f9c36d38f2462edbd368b97a07c
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
      matrix:
-        vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index 74864db0..fd921458 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065]
+        vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 386c7064..4cb47037 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

 | vLLM Ascend | vLLM         | Python           | Stable CANN | PyTorch/torch_npu  |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

 ## Release cadence
diff --git a/tests/e2e/multicard/2-cards/test_data_parallel.py b/tests/e2e/multicard/2-cards/test_data_parallel.py
index 32927755..c4bc93a2 100644
--- a/tests/e2e/multicard/2-cards/test_data_parallel.py
+++ b/tests/e2e/multicard/2-cards/test_data_parallel.py
@@ -27,6 +27,8 @@ from unittest.mock import patch

 import pytest

+from tests.e2e.conftest import wait_until_npu_memory_free
+
 MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]

@@ -34,6 +36,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
+@wait_until_npu_memory_free(target_free_percentage=0.95)
 def test_qwen3_inference_dp2(model, max_tokens):
     moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
     quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
diff --git a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
index de971834..bcbbecc0 100644
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -27,7 +27,7 @@ from unittest.mock import patch
 import pytest
 from vllm import SamplingParams

-from tests.e2e.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
 from tests.e2e.model_utils import check_outputs_equal

 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
@@ -91,6 +91,7 @@ def test_qwen3_w4a8_dynamic_tp2(model):
         vllm_model.generate_greedy(prompts, max_tokens)


+@wait_until_npu_memory_free(target_free_percentage=0.95)
 def test_qwen3_moe_sp_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
@@ -111,6 +112,7 @@ def test_qwen3_moe_sp_tp2() -> None:

 @pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
+@wait_until_npu_memory_free(target_free_percentage=0.95)
 def test_deepseek_w4a8_accuracy_tp2(model):
     prompts = [
         "Hello, my name is",