1. Speed up the e2e light test.
2. Create `2-cards` and `4-cards` folders in multicard.
3. Move ops to nightly.
4. Run tests in alphabetical order.
- vLLM version: v0.13.0
- vLLM main:
8be6432bda
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
49 lines
1.1 KiB
Python
49 lines
1.1 KiB
Python
import os
|
|
import subprocess
|
|
import sys
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
# Model identifiers supplied to the `model` test parameter.
MODELS = [
    "Qwen/Qwen3-30B-A3B",
]
|
|
|
|
|
|
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
def test_qwen3_inference_dp2_tp2(model, max_tokens):
    """Run the offline data-parallel example with dp-size 2 and tp-size 2.

    The example script is launched as a child process with the current
    interpreter. Its combined stdout/stderr is captured, echoed, and checked
    for the per-rank and generation markers before the exit code is verified.

    Args:
        model: Model identifier forwarded to the script via ``--model``.
        max_tokens: Parametrized value; not referenced in the test body.
    """
    command = [
        sys.executable,
        "examples/offline_data_parallel.py",
        "--model",
        model,
        "--dp-size",
        "2",
        "--tp-size",
        "2",
        "--node-size",
        "1",
        "--node-rank",
        "0",
        "--trust-remote-code",
    ]
    print(f"Running subprocess: {' '.join(command)}")

    # The environment is copied while the patch.dict decorator is active, so
    # the child process receives the patched ASCEND_RT_VISIBLE_DEVICES value.
    result = subprocess.run(
        command,
        env=os.environ.copy(),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr into the captured stream
        timeout=600,
    )

    captured = result.stdout.decode(errors='ignore')
    print(captured)

    assert "DP rank 0 needs to process" in captured
    assert "DP rank 1 needs to process" in captured
    assert "Generated text:" in captured
    assert result.returncode == 0
|