Main2main upgrade vllm to 0318 commit (#7412)
### What this PR does / why we need it?
Upgrade vllm commit to 0318.
Main content: added a pre-operation that cleans up NPU memory and waits
(default max 50s) for the cleanup to complete. It is applied to test cases
that previously failed because earlier test cases did not release NPU
memory in a timely manner.
### Does this PR introduce _any_ user-facing change?
NA
### How was this patch tested?
NA
- vLLM version: v0.17.0
- vLLM main:
4497431df6
---------
Signed-off-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
2
.github/workflows/bot_pr_create.yaml
vendored
2
.github/workflows/bot_pr_create.yaml
vendored
@@ -37,7 +37,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- name: Get vLLM version
|
- name: Get vLLM version
|
||||||
run: |
|
run: |
|
||||||
VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
|
VLLM_COMMIT=8b6325758cce5f9c36d38f2462edbd368b97a07c
|
||||||
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
|
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
|
||||||
|
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
|
|||||||
|
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
# For lint purpose, actually we need make a main2main matching.
|
# For lint purpose, actually we need make a main2main matching.
|
||||||
ARG VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
|
ARG VLLM_COMMIT=8b6325758cce5f9c36d38f2462edbd368b97a07c
|
||||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
|
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
|
||||||
cd /vllm-workspace/vllm && \
|
cd /vllm-workspace/vllm && \
|
||||||
git checkout $VLLM_COMMIT
|
git checkout $VLLM_COMMIT
|
||||||
|
|||||||
2
.github/workflows/pr_test_full.yaml
vendored
2
.github/workflows/pr_test_full.yaml
vendored
@@ -75,7 +75,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
|
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
6
.github/workflows/pr_test_light.yaml
vendored
6
.github/workflows/pr_test_light.yaml
vendored
@@ -41,7 +41,7 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
uses: ./.github/workflows/_pre_commit.yml
|
uses: ./.github/workflows/_pre_commit.yml
|
||||||
with:
|
with:
|
||||||
vllm: 8a680463fab3bc9e6760417cd5c0a6aa58283065
|
vllm: 8b6325758cce5f9c36d38f2462edbd368b97a07c
|
||||||
changes:
|
changes:
|
||||||
runs-on: linux-aarch64-a2b3-0
|
runs-on: linux-aarch64-a2b3-0
|
||||||
outputs:
|
outputs:
|
||||||
@@ -90,7 +90,7 @@ jobs:
|
|||||||
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
|
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
||||||
uses: ./.github/workflows/_unit_test.yaml
|
uses: ./.github/workflows/_unit_test.yaml
|
||||||
with:
|
with:
|
||||||
vllm: ${{ matrix.vllm_version }}
|
vllm: ${{ matrix.vllm_version }}
|
||||||
@@ -102,7 +102,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
|
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0]
|
||||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ jobs:
|
|||||||
name: refresh codecov
|
name: refresh codecov
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065]
|
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c]
|
||||||
uses: ./.github/workflows/_unit_test.yaml
|
uses: ./.github/workflows/_unit_test.yaml
|
||||||
with:
|
with:
|
||||||
vllm: ${{ matrix.vllm_version }}
|
vllm: ${{ matrix.vllm_version }}
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
|
|||||||
|
|
||||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
||||||
|-------------|--------------|------------------|-------------|--------------------|
|
|-------------|--------------|------------------|-------------|--------------------|
|
||||||
| main | 8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
|
| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
|
||||||
|
|
||||||
## Release cadence
|
## Release cadence
|
||||||
|
|
||||||
|
|||||||
@@ -27,6 +27,8 @@ from unittest.mock import patch
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from tests.e2e.conftest import wait_until_npu_memory_free
|
||||||
|
|
||||||
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
||||||
|
|
||||||
|
|
||||||
@@ -34,6 +36,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
|||||||
@pytest.mark.parametrize("max_tokens", [32])
|
@pytest.mark.parametrize("max_tokens", [32])
|
||||||
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
|
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
|
||||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
|
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
|
||||||
|
@wait_until_npu_memory_free(target_free_percentage=0.95)
|
||||||
def test_qwen3_inference_dp2(model, max_tokens):
|
def test_qwen3_inference_dp2(model, max_tokens):
|
||||||
moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
||||||
quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ from unittest.mock import patch
|
|||||||
import pytest
|
import pytest
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
|
|
||||||
from tests.e2e.conftest import VllmRunner
|
from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
|
||||||
from tests.e2e.model_utils import check_outputs_equal
|
from tests.e2e.model_utils import check_outputs_equal
|
||||||
|
|
||||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||||
@@ -91,6 +91,7 @@ def test_qwen3_w4a8_dynamic_tp2(model):
|
|||||||
vllm_model.generate_greedy(prompts, max_tokens)
|
vllm_model.generate_greedy(prompts, max_tokens)
|
||||||
|
|
||||||
|
|
||||||
|
@wait_until_npu_memory_free(target_free_percentage=0.95)
|
||||||
def test_qwen3_moe_sp_tp2() -> None:
|
def test_qwen3_moe_sp_tp2() -> None:
|
||||||
example_prompts = [
|
example_prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
@@ -111,6 +112,7 @@ def test_qwen3_moe_sp_tp2() -> None:
|
|||||||
|
|
||||||
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
|
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
|
||||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
|
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
|
||||||
|
@wait_until_npu_memory_free(target_free_percentage=0.95)
|
||||||
def test_deepseek_w4a8_accuracy_tp2(model):
|
def test_deepseek_w4a8_accuracy_tp2(model):
|
||||||
prompts = [
|
prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
|
|||||||
Reference in New Issue
Block a user