Files
xc-llm-ascend/tests/e2e/multicard/2-cards/test_external_launcher.py
SILONG ZENG 43df2cb2fc [Lint]Style: Convert test/ to ruff format(Batch #1) (#6738)
### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
9562912cea

Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-03-10 09:52:50 +08:00

242 lines
6.2 KiB
Python

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
End-to-end tests that run the `examples/offline_external_launcher.py` script in a subprocess and check its output.
Run `pytest tests/e2e/multicard/2-cards/test_external_launcher.py`.
"""
import os
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
import torch_npu
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import wait_until_npu_memory_free
# Model identifiers used to parametrize the non-MoE launcher tests in this file.
MODELS = ["Qwen/Qwen3-0.6B"]
# Model identifiers used to parametrize the expert-parallel launcher test.
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
# First 10 characters of the name reported for device 0, read once at import
# time. It is compared against "Ascend910B" (exactly 10 characters) to decide
# whether the matmul-allreduce test is skipped.
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher(model):
    """Run the offline external-launcher example with ``--tp-size 1`` on two processes.

    The example script is started as a subprocess (with ``HCCL_BUFFSIZE``
    patched to ``"500"`` in the inherited environment). The test passes when
    the captured output mentions TP rank groups ``[0]`` and ``[1]``, contains
    a generated text marker, and the subprocess exits with status 0.
    """
    repo_root = Path(__file__).parent.parent.parent.parent.parent
    launcher = repo_root / "examples" / "offline_external_launcher.py"
    # Snapshot the (patched) environment so the child process inherits it.
    child_env = os.environ.copy()

    # TODO: Change to 2 when ci machine has 4 cards
    launch_cmd = [sys.executable, str(launcher)]
    launch_cmd += ["--model", model]
    launch_cmd += ["--tp-size", "1"]
    launch_cmd += ["--node-size", "1"]
    launch_cmd += ["--node-rank", "0"]
    launch_cmd += ["--proc-per-node", "2"]
    launch_cmd += ["--trust-remote-code"]

    print("Running subprocess: " + " ".join(launch_cmd))
    # stderr is folded into stdout so a single buffer holds everything the child printed.
    completed = subprocess.run(
        launch_cmd,
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )
    text = completed.stdout.decode(errors="ignore")
    print(text)

    for marker in ("TP RANKS: [0]", "TP RANKS: [1]", "Generated text:"):
        assert marker in text
    assert completed.returncode == 0
@pytest.mark.parametrize("model", MOE_MODELS)
@wait_until_npu_memory_free()
def test_qwen3_moe_external_launcher_ep_tp2(model):
    """Run the offline external-launcher example with ``--tp-size 2`` and expert parallelism.

    The example script is started as a subprocess with
    ``--enable-expert-parallel`` and two processes per node. The test passes
    when the captured output mentions the TP rank group ``[0, 1]``, contains a
    generated text marker, and the subprocess exits with status 0.
    """
    repo_root = Path(__file__).parent.parent.parent.parent.parent
    launcher = repo_root / "examples" / "offline_external_launcher.py"
    child_env = os.environ.copy()

    # TODO: Change to 2 when ci machine has 4 cards
    # NOTE(review): --tp-size is already "2" here; confirm what this TODO still refers to.
    launch_cmd = [sys.executable, str(launcher)]
    launch_cmd += ["--model", model]
    launch_cmd += ["--tp-size", "2"]
    launch_cmd += ["--node-size", "1"]
    launch_cmd += ["--node-rank", "0"]
    launch_cmd += ["--proc-per-node", "2"]
    launch_cmd += ["--trust-remote-code", "--enable-expert-parallel"]

    print("Running subprocess: " + " ".join(launch_cmd))
    # stderr is folded into stdout so a single buffer holds everything the child printed.
    completed = subprocess.run(
        launch_cmd,
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )
    text = completed.stdout.decode(errors="ignore")
    print(text)

    for marker in ("TP RANKS: [0, 1]", "Generated text:"):
        assert marker in text
    assert completed.returncode == 0
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
@wait_until_npu_memory_free()
def test_qwen3_external_launcher_with_sleepmode():
    """Run the offline external-launcher example with sleep mode enabled.

    The example script is started as a subprocess for ``Qwen/Qwen3-8B`` with
    ``--enable-sleep-mode``, ``--temperature 0`` and ``--model-weight-gib 16``
    (with ``VLLM_ASCEND_ENABLE_NZ`` patched to ``"0"`` in the inherited
    environment). The test passes when the captured output contains a
    generated text marker and the sleep/wake-up success message, and the
    subprocess exits with status 0.
    """
    repo_root = Path(__file__).parent.parent.parent.parent.parent
    launcher = repo_root / "examples" / "offline_external_launcher.py"
    # Snapshot the (patched) environment so the child process inherits it.
    child_env = os.environ.copy()

    # TODO: Change to 2 when ci machine has 4 cards
    launch_cmd = [sys.executable, str(launcher)]
    launch_cmd += ["--model", "Qwen/Qwen3-8B"]
    launch_cmd += ["--tp-size", "1"]
    launch_cmd += ["--node-size", "1"]
    launch_cmd += ["--node-rank", "0"]
    launch_cmd += ["--proc-per-node", "2"]
    launch_cmd += ["--trust-remote-code", "--enable-sleep-mode"]
    launch_cmd += ["--temperature", "0"]
    launch_cmd += ["--model-weight-gib", "16"]

    print("Running subprocess: " + " ".join(launch_cmd))
    # stderr is folded into stdout so a single buffer holds everything the child printed.
    completed = subprocess.run(
        launch_cmd,
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=300,
    )
    text = completed.stdout.decode(errors="ignore")
    print(text)

    for marker in ("Generated text:", "Sleep and wake up successfully!!"):
        assert marker in text
    assert completed.returncode == 0
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode_level2():
    """Run the offline external-launcher example with sleep mode at level 2.

    ``Qwen/Qwen3-8B`` is first resolved through ``snapshot_download`` and the
    returned value is passed as ``--model``. The example script is then
    started as a subprocess with ``--enable-sleep-mode`` and
    ``--sleep-mode-level 2`` (with ``VLLM_ASCEND_ENABLE_NZ`` patched to
    ``"0"`` in the inherited environment). The test passes when the captured
    output contains a generated text marker and the sleep/wake-up success
    message, and the subprocess exits with status 0.
    """
    repo_root = Path(__file__).parent.parent.parent.parent.parent
    launcher = repo_root / "examples" / "offline_external_launcher.py"
    # Snapshot the (patched) environment before resolving the model.
    child_env = os.environ.copy()
    model_path = snapshot_download("Qwen/Qwen3-8B")

    # TODO: Add moe model test
    launch_cmd = [sys.executable, str(launcher)]
    launch_cmd += ["--model", model_path]
    launch_cmd += ["--tp-size", "1"]
    launch_cmd += ["--node-size", "1"]
    launch_cmd += ["--node-rank", "0"]
    launch_cmd += ["--proc-per-node", "2"]
    launch_cmd += ["--trust-remote-code", "--enable-sleep-mode"]
    launch_cmd += ["--temperature", "0"]
    launch_cmd += ["--model-weight-gib", "16"]
    launch_cmd += ["--sleep-mode-level", "2"]

    print("Running subprocess: " + " ".join(launch_cmd))
    # stderr is folded into stdout so a single buffer holds everything the child printed.
    completed = subprocess.run(
        launch_cmd,
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=300,
    )
    text = completed.stdout.decode(errors="ignore")
    print(text)

    for marker in ("Generated text:", "Sleep and wake up successfully!!"):
        assert marker in text
    assert completed.returncode == 0
@pytest.mark.skipif(DEVICE_NAME != "Ascend910B", reason="This test is only for Ascend910B devices.")
@pytest.mark.parametrize("model", MODELS)
@wait_until_npu_memory_free()
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1", "HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher_with_matmul_allreduce(model):
    """Run the offline external-launcher example with matmul-allreduce enabled via env.

    Skipped unless ``DEVICE_NAME`` equals ``"Ascend910B"``. The example script
    is started as a subprocess with only ``--model`` and
    ``--trust-remote-code`` (with ``VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE`` set
    to ``"1"`` and ``HCCL_BUFFSIZE`` set to ``"500"`` in the inherited
    environment). The test passes when the captured output contains a
    generated text marker and the subprocess exits with status 0.
    """
    repo_root = Path(__file__).parent.parent.parent.parent.parent
    launcher = repo_root / "examples" / "offline_external_launcher.py"
    # Snapshot the (patched) environment so the child process inherits it.
    child_env = os.environ.copy()

    launch_cmd = [sys.executable, str(launcher)]
    launch_cmd += ["--model", model]
    launch_cmd += ["--trust-remote-code"]

    print("Running subprocess: " + " ".join(launch_cmd))
    # stderr is folded into stdout so a single buffer holds everything the child printed.
    completed = subprocess.run(
        launch_cmd,
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )
    text = completed.stdout.decode(errors="ignore")
    print(text)

    assert "Generated text:" in text
    assert completed.returncode == 0