### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -29,6 +29,7 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
import torch_npu
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
|
||||
from tests.e2e.conftest import wait_until_npu_memory_free
|
||||
|
||||
MODELS = ["Qwen/Qwen3-0.6B"]
|
||||
@@ -39,9 +40,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
|
||||
def test_qwen3_external_launcher(model):
|
||||
script = Path(
|
||||
__file__
|
||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
@@ -68,7 +67,7 @@ def test_qwen3_external_launcher(model):
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode(errors='ignore')
|
||||
output = proc.stdout.decode(errors="ignore")
|
||||
|
||||
print(output)
|
||||
|
||||
@@ -81,16 +80,24 @@ def test_qwen3_external_launcher(model):
|
||||
@pytest.mark.parametrize("model", MOE_MODELS)
|
||||
@wait_until_npu_memory_free()
|
||||
def test_qwen3_moe_external_launcher_ep_tp2(model):
|
||||
script = Path(
|
||||
__file__
|
||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
|
||||
"--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
|
||||
"--enable-expert-parallel"
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--node-size",
|
||||
"1",
|
||||
"--node-rank",
|
||||
"0",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enable-expert-parallel",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
@@ -101,7 +108,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode(errors='ignore')
|
||||
output = proc.stdout.decode(errors="ignore")
|
||||
|
||||
print(output)
|
||||
|
||||
@@ -113,9 +120,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
||||
@wait_until_npu_memory_free()
|
||||
def test_qwen3_external_launcher_with_sleepmode():
|
||||
script = Path(
|
||||
__file__
|
||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
@@ -147,7 +152,7 @@ def test_qwen3_external_launcher_with_sleepmode():
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=300,
|
||||
)
|
||||
output = proc.stdout.decode(errors='ignore')
|
||||
output = proc.stdout.decode(errors="ignore")
|
||||
|
||||
print(output)
|
||||
|
||||
@@ -158,9 +163,7 @@ def test_qwen3_external_launcher_with_sleepmode():
|
||||
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
||||
def test_qwen3_external_launcher_with_sleepmode_level2():
|
||||
script = Path(
|
||||
__file__
|
||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
env = os.environ.copy()
|
||||
model_path = snapshot_download("Qwen/Qwen3-8B")
|
||||
# TODO: Add moe model test
|
||||
@@ -195,7 +198,7 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=300,
|
||||
)
|
||||
output = proc.stdout.decode(errors='ignore')
|
||||
output = proc.stdout.decode(errors="ignore")
|
||||
|
||||
print(output)
|
||||
|
||||
@@ -210,14 +213,9 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@wait_until_npu_memory_free()
|
||||
@patch.dict(os.environ, {
|
||||
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
|
||||
"HCCL_BUFFSIZE": "500"
|
||||
})
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1", "HCCL_BUFFSIZE": "500"})
|
||||
def test_qwen3_external_launcher_with_matmul_allreduce(model):
|
||||
script = Path(
|
||||
__file__
|
||||
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
|
||||
env = os.environ.copy()
|
||||
cmd = [
|
||||
sys.executable,
|
||||
@@ -236,7 +234,7 @@ def test_qwen3_external_launcher_with_matmul_allreduce(model):
|
||||
timeout=600,
|
||||
)
|
||||
|
||||
output = proc.stdout.decode(errors='ignore')
|
||||
output = proc.stdout.decode(errors="ignore")
|
||||
print(output)
|
||||
|
||||
assert "Generated text:" in output
|
||||
|
||||
Reference in New Issue
Block a user