Main2main upgrade vllm to 0318 commit (#7412)

### What this PR does / why we need it?
Upgrade vllm commit to 0318. 

Main content: Added a pre-operation that cleans up NPU memory and waits
(default max 50s) for the cleanup to complete. It is applied to test cases
that previously failed because the preceding test cases did not release
NPU memory in time.

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
NA

- vLLM version: v0.17.0
- vLLM main:
4497431df6

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
Nengjun Ma
2026-03-19 17:17:36 +08:00
committed by GitHub
parent 05afc7f8c3
commit ee804ce23e
8 changed files with 14 additions and 9 deletions

View File

@@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
VLLM_COMMIT=8b6325758cce5f9c36d38f2462edbd368b97a07c
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
- name: Checkout repository

View File

@@ -27,7 +27,7 @@ RUN apt-get update -y && \
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purposes only; in practice this must be kept matching the main branch (main-to-main).
ARG VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
ARG VLLM_COMMIT=8b6325758cce5f9c36d38f2462edbd368b97a07c
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT

View File

@@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 8a680463fab3bc9e6760417cd5c0a6aa58283065
vllm: 8b6325758cce5f9c36d38f2462edbd368b97a07c
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
@@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065]
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}

View File

@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | 8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
## Release cadence

View File

@@ -27,6 +27,8 @@ from unittest.mock import patch
import pytest
from tests.e2e.conftest import wait_until_npu_memory_free
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
@@ -34,6 +36,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_qwen3_inference_dp2(model, max_tokens):
moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]

View File

@@ -27,7 +27,7 @@ from unittest.mock import patch
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
from tests.e2e.model_utils import check_outputs_equal
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
@@ -91,6 +91,7 @@ def test_qwen3_w4a8_dynamic_tp2(model):
vllm_model.generate_greedy(prompts, max_tokens)
@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_qwen3_moe_sp_tp2() -> None:
example_prompts = [
"Hello, my name is",
@@ -111,6 +112,7 @@ def test_qwen3_moe_sp_tp2() -> None:
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_deepseek_w4a8_accuracy_tp2(model):
prompts = [
"Hello, my name is",