Main2main upgrade vllm to 0318 commit (#7412)

### What this PR does / why we need it? Upgrade vllm commit to 0318. Main content: Added a pre-operation for cleaning up and waiting(default max 50s) for the completion of the clean up of the NPU memory to some test cases that failed due to the failure to release the NPU memory in a timely manner when the previous test cases were executed. ### Does this PR introduce _any_ user-facing change? NA ### How was this patch tested? NA - vLLM version: v0.17.0 - vLLM main: 4497431df6 --------- Signed-off-by: leo-pony <nengjunma@outlook.com>
2026-03-19 17:17:36 +08:00
parent 05afc7f8c3
commit ee804ce23e
8 changed files with 14 additions and 9 deletions
--- a/tests/e2e/multicard/2-cards/test_data_parallel.py
+++ b/tests/e2e/multicard/2-cards/test_data_parallel.py
@@ -27,6 +27,8 @@ from unittest.mock import patch

 import pytest

+from tests.e2e.conftest import wait_until_npu_memory_free
+
 MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]


@@ -34,6 +36,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
+@wait_until_npu_memory_free(target_free_percentage=0.95)
 def test_qwen3_inference_dp2(model, max_tokens):
    moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
    quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -27,7 +27,7 @@ from unittest.mock import patch
 import pytest
 from vllm import SamplingParams

-from tests.e2e.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
 from tests.e2e.model_utils import check_outputs_equal

 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
@@ -91,6 +91,7 @@ def test_qwen3_w4a8_dynamic_tp2(model):
        vllm_model.generate_greedy(prompts, max_tokens)


+@wait_until_npu_memory_free(target_free_percentage=0.95)
 def test_qwen3_moe_sp_tp2() -> None:
    example_prompts = [
        "Hello, my name is",
@@ -111,6 +112,7 @@ def test_qwen3_moe_sp_tp2() -> None:

@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
+@wait_until_npu_memory_free(target_free_percentage=0.95)
 def test_deepseek_w4a8_accuracy_tp2(model):
    prompts = [
        "Hello, my name is",