[CI] cleanup single/multi-card test (#5623)
1. speed up e2e light test.
2. create `2-cards` and `4-cards` folder in multicard
3. move ops to nightly
4. run tests in alphabetical order
- vLLM version: v0.13.0
- vLLM main:
8be6432bda
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
240
tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py
Normal file
240
tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py
Normal file
@@ -0,0 +1,240 @@
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import contextlib
|
||||
import gc
|
||||
import math
|
||||
import multiprocessing
|
||||
import os
|
||||
from typing import Any
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
||||
|
||||
MODELS = [
|
||||
# Offline data parallel mode will be not supported/useful for dense models
|
||||
# "Qwen/Qwen3-0.6B",
|
||||
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
|
||||
]
|
||||
|
||||
|
||||
def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
    """Patch selected NPU/runner methods with counting wrappers.

    Every wrapped method bumps its process-shared counter (under the
    counter's own lock) and then delegates to the original
    implementation.  Closing the returned ``ExitStack`` restores all
    patched methods.
    """
    from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

    def _counting_wrapper(target_cls, method_name, shared_counter):
        # Keep a reference to the unpatched method so the spy can delegate.
        unpatched = getattr(target_cls, method_name)

        def _spy(self, *args, **kwargs):
            with shared_counter.get_lock():
                shared_counter.value += 1
            return unpatched(self, *args, **kwargs)

        return _spy

    # (class, method, counter) triples to instrument.
    patch_targets = (
        (torch.npu.NPUGraph, "replay", counters["replay"]),
        (torch.npu.NPUGraph, "__init__", counters["capture"]),
        (NPUModelRunner, "execute_model", counters["exec_model"]),
        (NPUModelRunner, "_dummy_run", counters["dummy_run"]),
    )

    stack = contextlib.ExitStack()
    for target_cls, method_name, shared_counter in patch_targets:
        stack.enter_context(
            patch.object(
                target_cls, method_name,
                _counting_wrapper(target_cls, method_name, shared_counter)))

    return stack
|
||||
|
||||
|
||||
def _run_worker_process(
    rank: int,
    local_rank: int,
    world_size: int,
    master_ip: str,
    master_port: int,
    counters: dict[str, Any],
    model_path: str,
    max_tokens: int,
):
    """Child-process entry point: set up DP env, run inference, clean up.

    The spies installed by ``_install_spies`` record capture/replay and
    execution counts into the process-shared ``counters`` while this
    worker generates its shard of the prompts.
    """
    # Data-parallel rendezvous settings must be in place before vLLM is
    # imported, since vLLM reads them at import/engine-construction time.
    os.environ.update({
        "VLLM_DP_RANK": str(rank),
        "VLLM_DP_RANK_LOCAL": str(local_rank),
        "VLLM_DP_SIZE": str(world_size),
        "VLLM_DP_MASTER_IP": master_ip,
        "VLLM_DP_MASTER_PORT": str(master_port),
    })

    # Import vLLM only after environment setup.
    from vllm import LLM, SamplingParams
    from vllm.distributed.parallel_state import (
        destroy_distributed_environment, destroy_model_parallel)

    # Apply hooks and run inference.
    with _install_spies(counters):
        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]

        # Simple data sharding: equal chunks, last rank takes the remainder.
        shard = len(prompts) // world_size
        lo = rank * shard
        hi = lo + shard if rank < world_size - 1 else len(prompts)
        local_prompts = prompts[lo:hi]

        llm = LLM(
            model=model_path,
            quantization="ascend" if "W8A8" in model_path else None,
            enable_expert_parallel="DeepSeek" in model_path,
            trust_remote_code=True,
            # vllm enables async scheduling by default, remove below when vllm >= 0.14.0
            async_scheduling=False,
        )

        # Expose model config to the main test process.
        counters["hidden_layers"].value = (
            llm.llm_engine.model_config.hf_text_config.num_hidden_layers)

        llm.generate(local_prompts,
                     SamplingParams(max_tokens=max_tokens, temperature=0.0))

        # Explicit cleanup is mandatory in multi-process vLLM tests.
        del llm

        destroy_model_parallel()
        destroy_distributed_environment()

        # destroy_process_group asserts if the group was already torn down.
        with contextlib.suppress(AssertionError):
            torch.distributed.destroy_process_group()

        gc.collect()
        torch.npu.empty_cache()
        torch.npu.reset_peak_memory_stats()
|
||||
|
||||
|
||||
# @patch.dict(os.environ, clear=["HCCL_OP_EXPANSION_MODE","VLLM_WORKER_MULTIPROC_METHOD"])
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [4, 36])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
def test_models_aclgraph_capture_replay_metrics_dp2(
    model: str,
    max_tokens: int,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check ACL-graph capture/replay/exec/dummy-run counts under dp=2.

    Two worker processes run offline inference while spies count graph
    captures, replays, model executions and dummy runs; the totals are
    compared against closed-form expectations.
    """
    # Counter doesn't work in default "spawn" mode.
    monkeypatch.delenv("VLLM_WORKER_MULTIPROC_METHOD", raising=False)

    # Shared counters for cross-process assertion.
    counters = {
        name: multiprocessing.Value("i", init)
        for name, init in (
            ("replay", 0),
            ("capture", 0),
            ("exec_model", 0),
            ("dummy_run", 0),
            ("hidden_layers", -1),  # filled in by the workers
        )
    }

    dp_size = 2
    port = get_open_port()

    # Launch one worker per DP rank.
    workers = []
    for rank in range(dp_size):
        worker = multiprocessing.Process(
            target=_run_worker_process,
            args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
                  max_tokens),
        )
        worker.start()
        workers.append(worker)

    # Supervision loop: any failed (or timed-out) worker aborts the group.
    for worker in workers:
        worker.join(timeout=900)
        if worker.exitcode != 0:
            for peer in workers:
                if peer.is_alive():
                    peer.kill()
            raise RuntimeError(
                f"Worker {worker.pid} failed with exit code {worker.exitcode}")

    actual_capture = counters["capture"].value
    actual_replay = counters["replay"].value
    num_execute_model = counters["exec_model"].value
    num_dummy_run = counters["dummy_run"].value
    num_layers = counters["hidden_layers"].value

    num_acl_graphs = num_layers + 1
    num_comm_groups = sum(1 for s in [dp_size, 1]
                          if s > 1)  # dp_size=2, tp_size=1

    # Metric 1: Graph Capture (ACL Graph Construction)
    # Ref: vllm_ascend.utils.update_aclgraph_sizes
    max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
                                 num_acl_graphs / (1 + num_comm_groups * 2))

    expected_capture = max_batch_sizes * num_acl_graphs * dp_size
    assert (
        actual_capture == expected_capture
    ), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"

    # Metric 2: Model Execution (NPUModelRunner.execute_model)
    # vLLM Step Breakdown:
    # 1. First step (prefill, 1 prompt)
    # 2. Generation steps (max_tokens)
    # 3. Final step (likely EOS/idle step), no replay here
    total_steps = max_tokens + 1  # this includes the 1 and 2 above
    expected_exec_model = (total_steps + 1) * dp_size

    assert (
        num_execute_model == expected_exec_model
    ), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"

    # Metric 3: Dummy Runs (Warmup & Alignment)
    # vLLM synchronizes globally every 32 steps.
    # Ref: vllm.v1.engine.core.DPEngineCoreProc._has_global_unfinished_reqs
    aligned_steps = (total_steps + 31) // 32 * 32

    # Part A: Warmup runs (Profile run + 2 runs per captured graph)
    warmup_runs = 1 + (2 * max_batch_sizes)
    soc_version = get_ascend_device_type()
    if soc_version in {AscendDeviceType.A3} and "DeepSeek" in model:
        # An extra warmup run is needed for MC2 warmup here
        warmup_runs += 1

    # Part B: Alignment padding (Empty runs to hit the 32-step boundary)
    padding_runs = aligned_steps - total_steps

    expected_dummy_run = (warmup_runs + padding_runs) * dp_size

    assert (
        num_dummy_run == expected_dummy_run
    ), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"

    # Metric 4: Graph Replay (Inference Execution)
    # Replays happen for every aligned step across all graphs.
    expected_replay = num_acl_graphs * aligned_steps * dp_size

    assert (
        actual_replay == expected_replay
    ), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
|
||||
79
tests/e2e/multicard/2-cards/test_data_parallel.py
Normal file
79
tests/e2e/multicard/2-cards/test_data_parallel.py
Normal file
@@ -0,0 +1,79 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Compare the outputs of vLLM with and without aclgraph.
|
||||
|
||||
Run `pytest tests/multicard/test_data_parallel.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_qwen3_inference_dp2(model, max_tokens):
    """Run the offline data-parallel example (dp=2, tp=1) as a subprocess
    and verify that both DP ranks received work and produced text."""
    moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
    quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
    # NOTE(review): relative path — assumes pytest is launched from the
    # repository root; verify against CI working directory.
    script = "examples/offline_data_parallel.py"

    env = os.environ.copy()

    cmd = [
        sys.executable, script,
        "--model", model,
        "--dp-size", "2",
        "--tp-size", "1",
        "--node-size", "1",
        "--node-rank", "0",
        "--trust-remote-code",
    ]

    if model in moe_models:
        cmd.append("--enable-expert-parallel")
    if model in quantization_models:
        cmd.extend(["--quantization", "ascend"])

    print(f"Running subprocess: {' '.join(cmd)}")
    proc = subprocess.run(cmd,
                          env=env,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=600)
    output = proc.stdout.decode(errors='ignore')

    print(output)

    assert "DP rank 0 needs to process" in output
    assert "DP rank 1 needs to process" in output
    assert "Generated text:" in output
    assert proc.returncode == 0
|
||||
34
tests/e2e/multicard/2-cards/test_expert_parallel.py
Normal file
34
tests/e2e/multicard/2-cards/test_expert_parallel.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import pytest
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
def test_deepseek_correctness_ep(model_name):
    """Greedy outputs with expert parallelism must match plain tp=2."""
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    max_tokens = 5

    # FIXME: Really strange that chunked prefill might lead to different results, investigate further
    with VllmRunner(model_name,
                    cudagraph_capture_sizes=[1, 2, 4, 8],
                    tensor_parallel_size=2) as baseline_runner:
        tp_output = baseline_runner.generate_greedy(example_prompts,
                                                    max_tokens)

    with VllmRunner(model_name,
                    tensor_parallel_size=2,
                    cudagraph_capture_sizes=[1, 2, 4, 8],
                    enable_expert_parallel=True) as ep_runner:
        ep_output = ep_runner.generate_greedy(example_prompts, max_tokens)

    # Token-level comparison of both runs.
    check_outputs_equal(
        outputs_0_lst=ep_output,
        outputs_1_lst=tp_output,
        name_0="ep_output",
        name_1="tp_output",
    )
|
||||
239
tests/e2e/multicard/2-cards/test_external_launcher.py
Normal file
239
tests/e2e/multicard/2-cards/test_external_launcher.py
Normal file
@@ -0,0 +1,239 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Compare the outputs of vLLM with and without aclgraph.
|
||||
|
||||
Run `pytest tests/multicard/test_external_launcher.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch_npu
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
|
||||
MODELS = ["Qwen/Qwen3-0.6B"]
|
||||
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
|
||||
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher(model):
    """Run the external-launcher example (2 procs, tp=1) and verify both
    TP ranks generated output."""
    script = Path(
        __file__).parents[4] / "examples" / "offline_external_launcher.py"
    env = os.environ.copy()
    # TODO: Change to 2 when ci machine has 4 cards
    cmd = [
        sys.executable, str(script),
        "--model", model,
        "--tp-size", "1",
        "--node-size", "1",
        "--node-rank", "0",
        "--proc-per-node", "2",
        "--trust-remote-code",
    ]

    print(f"Running subprocess: {' '.join(cmd)}")
    proc = subprocess.run(cmd,
                          env=env,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=600)
    output = proc.stdout.decode(errors='ignore')

    print(output)

    assert "TP RANKS: [0]" in output
    assert "TP RANKS: [1]" in output
    assert "Generated text:" in output
    assert proc.returncode == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MOE_MODELS)
def test_qwen3_moe_external_launcher_ep_tp2(model):
    """Run the external-launcher example with tp=2 + expert parallelism
    and verify a shared TP group and generated output."""
    script = Path(
        __file__).parents[4] / "examples" / "offline_external_launcher.py"
    env = os.environ.copy()
    # TODO: Change to 2 when ci machine has 4 cards
    cmd = [
        sys.executable, str(script),
        "--model", model,
        "--tp-size", "2",
        "--node-size", "1",
        "--node-rank", "0",
        "--proc-per-node", "2",
        "--trust-remote-code",
        "--enable-expert-parallel",
    ]

    print(f"Running subprocess: {' '.join(cmd)}")
    proc = subprocess.run(cmd,
                          env=env,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=600)
    output = proc.stdout.decode(errors='ignore')

    print(output)

    assert "TP RANKS: [0, 1]" in output
    assert "Generated text:" in output
    assert proc.returncode == 0
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode():
    """Run the external-launcher example with sleep mode enabled and
    verify the sleep/wake-up round trip succeeds."""
    script = Path(
        __file__).parents[4] / "examples" / "offline_external_launcher.py"
    env = os.environ.copy()
    # TODO: Change to 2 when ci machine has 4 cards
    cmd = [
        sys.executable, str(script),
        "--model", "Qwen/Qwen3-8B",
        "--tp-size", "1",
        "--node-size", "1",
        "--node-rank", "0",
        "--proc-per-node", "2",
        "--trust-remote-code",
        "--enable-sleep-mode",
        "--temperature", "0",
        "--model-weight-gib", "16",
    ]

    print(f"Running subprocess: {' '.join(cmd)}")
    proc = subprocess.run(cmd,
                          env=env,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=300)
    output = proc.stdout.decode(errors='ignore')

    print(output)

    assert "Generated text:" in output
    assert "Sleep and wake up successfully!!" in output
    assert proc.returncode == 0
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode_level2():
    """Same as the sleep-mode test, but with sleep level 2 (weights are
    discarded on sleep and reloaded from the local snapshot on wake)."""
    script = Path(
        __file__).parents[4] / "examples" / "offline_external_launcher.py"
    env = os.environ.copy()
    # Level 2 reloads weights from disk, so pre-download the snapshot.
    model_path = snapshot_download("Qwen/Qwen3-8B")
    # TODO: Add moe model test
    cmd = [
        sys.executable, str(script),
        "--model", model_path,
        "--tp-size", "1",
        "--node-size", "1",
        "--node-rank", "0",
        "--proc-per-node", "2",
        "--trust-remote-code",
        "--enable-sleep-mode",
        "--temperature", "0",
        "--model-weight-gib", "16",
        "--sleep-mode-level", "2",
    ]

    print(f"Running subprocess: {' '.join(cmd)}")
    proc = subprocess.run(cmd,
                          env=env,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=300)
    output = proc.stdout.decode(errors='ignore')

    print(output)

    assert "Generated text:" in output
    assert "Sleep and wake up successfully!!" in output
    assert proc.returncode == 0
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    DEVICE_NAME != "Ascend910B",
    reason="This test is only for Ascend910B devices.",
)
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {
    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
    "HCCL_BUFFSIZE": "500"
})
def test_qwen3_external_launcher_with_matmul_allreduce(model):
    """Run the external-launcher example with the fused matmul-allreduce
    path enabled and verify generation completes."""
    script = Path(
        __file__).parents[4] / "examples" / "offline_external_launcher.py"
    env = os.environ.copy()
    cmd = [
        sys.executable, str(script),
        "--model", model,
        "--trust-remote-code",
    ]

    print(f"Running subprocess: {' '.join(cmd)}")
    proc = subprocess.run(cmd,
                          env=env,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=600)

    output = proc.stdout.decode(errors='ignore')
    print(output)

    assert "Generated text:" in output
    assert proc.returncode == 0
|
||||
114
tests/e2e/multicard/2-cards/test_full_graph_mode.py
Normal file
114
tests/e2e/multicard/2-cards/test_full_graph_mode.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
|
||||
#
|
||||
import os
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
|
||||
def test_qwen3_moe_full_decode_only_tp2():
    """FULL_DECODE_ONLY graph-mode outputs must match the default mode."""
    # Drop any inherited HCCL_OP_EXPANSION_MODE override before starting.
    os.environ.pop('HCCL_OP_EXPANSION_MODE', None)
    prompts = [
        "Hello, my name is", "The president of the United States is",
        "The capital of France is", "The future of AI is"
    ]
    model = "Qwen/Qwen3-30B-A3B"
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)

    with VllmRunner(model,
                    max_model_len=1024,
                    tensor_parallel_size=2,
                    compilation_config={
                        "cudagraph_mode": "FULL_DECODE_ONLY",
                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
                    }) as runner:
        vllm_fullgraph_outputs = runner.model.generate(prompts,
                                                       sampling_params)

    with VllmRunner(
            model,
            max_model_len=1024,
            cudagraph_capture_sizes=[4, 8, 24, 48, 60],
            tensor_parallel_size=2,
    ) as runner:
        vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

    # Pair each completion with its index so check_outputs_equal can align.
    vllm_fullgraph_outputs_list = [(out.outputs[0].index, out.outputs[0].text)
                                   for out in vllm_fullgraph_outputs]
    vllm_eager_outputs_list = [(out.outputs[0].index, out.outputs[0].text)
                               for out in vllm_eager_outputs]

    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=vllm_fullgraph_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="vllm_fullgraph_outputs",
    )
|
||||
|
||||
|
||||
def test_qwen3_moe_full_graph_tp2():
    """FULL graph-mode outputs must match the default mode."""
    # Drop any inherited HCCL_OP_EXPANSION_MODE override before starting.
    os.environ.pop('HCCL_OP_EXPANSION_MODE', None)
    prompts = [
        "Hello, my name is", "The president of the United States is",
        "The capital of France is", "The future of AI is"
    ]
    model = "Qwen/Qwen3-30B-A3B"
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)

    with VllmRunner(model,
                    max_model_len=1024,
                    tensor_parallel_size=2,
                    compilation_config={
                        "cudagraph_mode": "FULL",
                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
                    }) as runner:
        vllm_fullgraph_outputs = runner.model.generate(prompts,
                                                       sampling_params)

    with VllmRunner(
            model,
            max_model_len=1024,
            cudagraph_capture_sizes=[4, 8, 24, 48, 60],
            tensor_parallel_size=2,
    ) as runner:
        vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

    # Pair each completion with its index so check_outputs_equal can align.
    vllm_fullgraph_outputs_list = [(out.outputs[0].index, out.outputs[0].text)
                                   for out in vllm_fullgraph_outputs]
    vllm_eager_outputs_list = [(out.outputs[0].index, out.outputs[0].text)
                               for out in vllm_eager_outputs]

    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=vllm_fullgraph_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="vllm_fullgraph_outputs",
    )
|
||||
25
tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
Normal file
25
tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
|
||||
MODEL_PATH, do_sample)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
    """LoRA sampling under tp=2 must reproduce the single-card golden
    outputs from tests.e2e.singlecard.test_ilama_lora."""
    with VllmRunner(
            snapshot_download(MODEL_PATH),
            enable_lora=True,
            max_loras=4,
            dtype="half",
            max_model_len=1024,
            max_num_seqs=16,
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            distributed_executor_backend=distributed_executor_backend,
    ) as vllm_model:
        generated = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)

    # Compare each generation against its recorded golden text.
    for idx, expected in enumerate(EXPECTED_LORA_OUTPUT):
        assert generated[idx] == expected
|
||||
@@ -0,0 +1,214 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
|
||||
#
|
||||
"""Compare the short outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
Run `pytest tests/test_offline_inference.py`.
|
||||
"""
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
QWEN_DENSE_MODELS = [
|
||||
"vllm-ascend/Qwen3-0.6B-W8A8",
|
||||
]
|
||||
|
||||
QWEN_W4A8_MODELS = [
|
||||
"vllm-ascend/Qwen3-1.7B-W4A8-V1",
|
||||
]
|
||||
|
||||
DEEPSEEK_W4A8_MODELS = [
|
||||
"vllm-ascend/DeepSeek-V3.1-W4A8-puring",
|
||||
]
|
||||
|
||||
|
||||
def test_deepseek_multistream_moe_tp2():
    """Smoke-test multistream MoE on the pruned DeepSeek-V3 model (tp=2)."""
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
            "vllm-ascend/DeepSeek-V3-Pruning",
            dtype="half",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            distributed_executor_backend="mp",
            additional_config={
                # Exercise the multistream MoE path specifically.
                "enable_multistream_moe": True,
                "refresh": True,
            },
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
def test_qwen3_w4a8_dynamic_tp2(model):
    """Smoke-test W4A8 dynamic quantization for Qwen3 under tp=2."""
    prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
            snapshot_download(model),
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)
|
||||
|
||||
|
||||
def test_qwen3_moe_sp_tp2() -> None:
    """Smoke-test the sequence-parallel pass (enable_sp) on Qwen3 MoE
    with tp=2, expert parallelism and eager execution."""
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
                    dtype="auto",
                    tensor_parallel_size=2,
                    distributed_executor_backend="mp",
                    compilation_config={"pass_config": {"enable_sp": True}},
                    enable_expert_parallel=True,
                    enforce_eager=True) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
def test_deepseek_w4a8_accuracy_tp2(model):
    """Greedy outputs of the W4A8 DeepSeek checkpoint must match the
    recorded golden answers (tp=2, expert parallel, ascend quant)."""
    prompts = [
        "Hello, my name is", "The president of the United States is",
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
    ]
    # Golden completions recorded from a known-good run of this checkpoint.
    vllm_ds_w4a8_answers = [
        '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
    ]
    sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
    with VllmRunner(snapshot_download(model),
                    dtype="auto",
                    tensor_parallel_size=2,
                    cudagraph_capture_sizes=[1, 2, 4, 8],
                    quantization="ascend",
                    enable_expert_parallel=True) as vllm_model:
        vllm_quant_outputs = vllm_model.model.generate(prompts,
                                                       sampling_params)

    vllm_quant_outputs_list = []
    for output in vllm_quant_outputs:
        vllm_quant_outputs_list.append(
            ([output.outputs[0].index], output.outputs[0].text))
    vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]

    # Fix: the name_0/name_1 labels were swapped relative to the output
    # lists (outputs_0_lst holds the golden answers), which produced
    # misleading failure messages.  Also removed a dead
    # `vllm_answer_list = []` assignment that was immediately overwritten.
    check_outputs_equal(outputs_0_lst=vllm_answer_list,
                        outputs_1_lst=vllm_quant_outputs_list,
                        name_0="vllm_answer_outputs",
                        name_1="vllm_quant_outputs")
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})
def test_qwen3_moe_fc2_tp2() -> None:
    """Smoke-test Qwen3 MoE with FLASHCOMM1 enabled and FLASHCOMM2
    parallel size 1 (tp=2, expert parallel, eager)."""
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
                    dtype="auto",
                    tensor_parallel_size=2,
                    distributed_executor_backend="mp",
                    enable_expert_parallel=True,
                    enforce_eager=True) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_deepseek_v2_lite_fc1_tp2() -> None:
    """Smoke-test DeepSeek-V2-Lite W8A8 with FLASHCOMM1 on a long prompt
    (tp=2, expert parallel, eager)."""
    # A long repeated prompt to push the prefill past typical chunk sizes.
    example_prompts = [
        "test" * 1001,
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)
    with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
                    dtype="auto",
                    tensor_parallel_size=2,
                    distributed_executor_backend="mp",
                    enable_expert_parallel=True,
                    enforce_eager=True,
                    quantization="ascend") as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_qwen3_dense_fc1_tp2(model):
    """Smoke-test dense Qwen3 models (TP=2, aclgraph) with FLASHCOMM1."""
    prompts = ["Hello, my name is"]
    n_new_tokens = 5
    weights = snapshot_download(model)
    with VllmRunner(
            weights,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as runner:
        runner.generate_greedy(prompts, n_new_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
    """Smoke-test dense Qwen3 models (TP=2, aclgraph) with MLP-weight prefetch."""
    prompts = ["Hello, my name is"]
    n_new_tokens = 5
    weights = snapshot_download(model)
    with VllmRunner(
            weights,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as runner:
        runner.generate_greedy(prompts, n_new_tokens)
|
||||
74
tests/e2e/multicard/2-cards/test_offline_weight_load.py
Normal file
74
tests/e2e/multicard/2-cards/test_offline_weight_load.py
Normal file
@@ -0,0 +1,74 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Run `pytest tests/multicard/test_offline_load_weight.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
MODELS = ["Qwen/Qwen3-30B-A3B"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_offline_load_and_sleepmode_tp2(model):
    """Run the offline external-launcher example in a subprocess.

    Launches ``examples/offline_external_launcher.py`` with TP=2 and sleep
    mode enabled, then asserts on the subprocess output: text was generated,
    sleep/wake-up succeeded, and the process exited cleanly.
    """
    # examples/ sits five levels above this test file.
    launcher = Path(
        __file__).parents[4] / "examples" / "offline_external_launcher.py"
    argv = [sys.executable, str(launcher)]
    argv += ["--model", model]
    argv += ["--tp-size", "2", "--node-size", "1", "--node-rank", "0"]
    argv += ["--proc-per-node", "2"]
    argv += ["--trust-remote-code", "--enable-sleep-mode"]
    argv += ["--temperature", "0", "--model-weight-gib", "0.8"]

    print(f"Running subprocess: {' '.join(argv)}")
    completed = subprocess.run(
        argv,
        env=os.environ.copy(),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout
        timeout=600,
    )
    text = completed.stdout.decode(errors='ignore')

    print(text)

    assert "Generated text:" in text
    assert "Sleep and wake up successfully!!" in text
    assert completed.returncode == 0
|
||||
48
tests/e2e/multicard/2-cards/test_pipeline_parallel.py
Normal file
48
tests/e2e/multicard/2-cards/test_pipeline_parallel.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
import pytest
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
# Models covering both a dense model and an MoE/MLA-style model;
# each is swept over the parallelism/backend combinations below.
MODELS = [
    "Qwen/Qwen3-0.6B",
    "deepseek-ai/DeepSeek-V2-Lite-Chat",
]

# TP is kept at 1 so the 2 cards are consumed by pipeline parallelism.
TENSOR_PARALLELS = [1]
PIPELINE_PARALLELS = [2]
# Both the multiprocessing and the Ray distributed executors are exercised.
DIST_EXECUTOR_BACKEND = ["mp", "ray"]

prompts = [
    "Hello, my name is",
    "The future of AI is",
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
def test_models_pp2(model: str, tp_size: int, pp_size: int,
                    distributed_executor_backend: str) -> None:
    """Greedy-decode under pipeline parallelism; passes if generation runs."""
    runner_kwargs = dict(
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        cudagraph_capture_sizes=[1, 2, 4, 8],
        distributed_executor_backend=distributed_executor_backend,
        gpu_memory_utilization=0.7,
    )
    with VllmRunner(model, **runner_kwargs) as runner:
        runner.generate_greedy(prompts, 64)
|
||||
85
tests/e2e/multicard/2-cards/test_prefix_caching.py
Normal file
85
tests/e2e/multicard/2-cards/test_prefix_caching.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compare the with and without prefix caching."""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
# for MHA
|
||||
"Qwen/Qwen3-8B",
|
||||
# for MLA
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
]
|
||||
|
||||
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
|
||||
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
|
||||
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|
||||
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
|
||||
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
|
||||
| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON |
|
||||
| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK |
|
||||
| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW |
|
||||
| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ |
|
||||
| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE |
|
||||
| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY |
|
||||
| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC |
|
||||
| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK |
|
||||
| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC|
|
||||
| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ |
|
||||
| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE |
|
||||
| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA |
|
||||
| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB |
|
||||
| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK |
|
||||
| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD |
|
||||
| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ |
|
||||
| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE |
|
||||
| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA |
|
||||
| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON |
|
||||
| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK |
|
||||
| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA |
|
||||
| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ|
|
||||
| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE |
|
||||
| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO |
|
||||
| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC |
|
||||
| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK |
|
||||
| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA |
|
||||
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
|
||||
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
|
||||
"""
|
||||
|
||||
INPUT_PROMPTS = [
|
||||
LONG_PROMPT +
|
||||
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
|
||||
LONG_PROMPT +
|
||||
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
    """Outputs with prefix caching (default on) must match outputs without it.

    Both runs share a long common prefix (LONG_PROMPT) so the second run with
    caching enabled actually hits the prefix cache.
    """
    shared_kwargs = dict(
        max_model_len=2048,
        tensor_parallel_size=2,
        cudagraph_capture_sizes=[1, 2, 4, 8],
        gpu_memory_utilization=0.7,
    )

    with VllmRunner(model, **shared_kwargs) as runner:
        cached_outputs = runner.generate_greedy(INPUT_PROMPTS, max_tokens)

    with VllmRunner(model, enable_prefix_caching=False,
                    **shared_kwargs) as runner:
        plain_outputs = runner.generate_greedy(INPUT_PROMPTS, max_tokens)

    check_outputs_equal(
        outputs_0_lst=plain_outputs,
        outputs_1_lst=cached_outputs,
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )
|
||||
44
tests/e2e/multicard/2-cards/test_quantization.py
Normal file
44
tests/e2e/multicard/2-cards/test_quantization.py
Normal file
@@ -0,0 +1,44 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
|
||||
#
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
|
||||
def test_qwen2_5_w8a8_external_quantized_tp2():
    """Check externally-quantized (compressed-tensors W8A8) weights load and
    reproduce a known-good greedy completion under TP=2.

    Fix: the golden/actual comparison now pairs results with ``zip`` plus an
    explicit length check instead of the ``for i in range(len(...))``
    anti-idiom, so a missing output fails loudly rather than being skipped.
    """
    example_prompts = [
        "The president of the United States is",
    ]
    max_tokens = 5
    with VllmRunner(
            snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            max_model_len=4096,
            gpu_memory_utilization=0.8,
    ) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    golden_results = [
        'The president of the United States is the head of state and',
    ]

    # One golden string per prompt; fail fast if counts ever diverge.
    assert len(golden_results) == len(vllm_output)
    for golden, (_, generated_text) in zip(golden_results, vllm_output):
        assert golden == generated_text
        print(f"Generated text: {generated_text!r}")
|
||||
126
tests/e2e/multicard/2-cards/test_qwen3_moe.py
Normal file
126
tests/e2e/multicard/2-cards/test_qwen3_moe.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
|
||||
#
|
||||
import json
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_qwen3_moe_distributed_mp_tp2_ep():
    """Smoke-test Qwen3-30B-A3B with TP=2 + expert parallel (mp backend)."""
    prompts = ["Hello, my name is"]
    n_new_tokens = 5
    runner_kwargs = dict(
        tensor_parallel_size=2,
        enable_expert_parallel=True,
        cudagraph_capture_sizes=[1, 2, 4, 8],
        distributed_executor_backend="mp",
    )
    with VllmRunner("Qwen/Qwen3-30B-A3B", **runner_kwargs) as runner:
        runner.generate_greedy(prompts, n_new_tokens)
|
||||
|
||||
|
||||
def test_qwen3_moe_w8a8_distributed_tp2():
    """Smoke-test the W8A8-quantized Qwen3-30B-A3B under TP=2."""
    prompts = ["Hello, my name is"]
    n_new_tokens = 5
    weights = snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8")
    with VllmRunner(
            weights,
            max_model_len=8192,
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as runner:
        runner.generate_greedy(prompts, n_new_tokens)
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
def test_qwen3_moe_distributed_aiv_tp2():
    """Smoke-test Qwen3-30B-A3B (TP=2) with HCCL AIV op-expansion mode.

    Fix: the env var is now applied via ``patch.dict`` (matching the other
    tests in this file) so it is restored after the test.  The previous bare
    ``os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'`` assignment leaked the
    setting into every later test in the same pytest process — other tests
    have to delete it defensively.
    """
    example_prompts = [
        "Hello, my name is",
    ]
    dtype = "auto"
    max_tokens = 5
    with VllmRunner(
            "Qwen/Qwen3-30B-A3B",
            dtype=dtype,
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
    """Verify dynamic EPLB does not change greedy generation.

    Starts the same model server twice on one port (sequentially): first
    without dynamic EPLB to record a ground-truth completion, then with
    dynamic EPLB enabled via --additional-config, and asserts both servers
    produce identical text for the same greedy request.
    """
    model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
    # One port reused for both launches; the first server is torn down by
    # its context manager before the second starts.
    port = get_open_port()
    server_args = [
        "--max_model_len", "8192", "--tensor_parallel_size", "2",
        "--enable_expert_parallel", "--quantization", "ascend", "--port",
        str(port), "--enforce_eager"
    ]
    env_dict = {"HCCL_BUFFSIZE": "1024"}
    # --- Run 1: baseline (no dynamic EPLB) -------------------------------
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            auto_port=False,
                            env_dict=env_dict) as server:
        client = server.get_async_client()
        # temperature=0 / top_p=1.0 makes the completion deterministic so the
        # two runs are directly comparable.
        batch = await client.completions.create(model=model,
                                                prompt="What is deeplearning?",
                                                max_tokens=300,
                                                temperature=0,
                                                top_p=1.0,
                                                n=1)
        gt_choices: list[openai.types.CompletionChoice] = batch.choices

    # dynamic eplb test
    # Since pytest runs as a daemon, it conflicts with the dynamic eplb manager
    # during initialization in offline mode, so the online mode is used instead.
    # NOTE: server_args/env_dict from run 1 are mutated in place and reused.
    env_dict.update({"DYNAMIC_EPLB": "true"})
    additional_config = {
        "dynamic_eplb": True,
        "num_iterations_eplb_update": 100,
        "num_wait_worker_iterations": 20
    }
    server_args.extend(["--additional-config", json.dumps(additional_config)])
    # --- Run 2: dynamic EPLB enabled -------------------------------------
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            auto_port=False,
                            env_dict=env_dict) as server:
        client = server.get_async_client()
        batch = await client.completions.create(model=model,
                                                prompt="What is deeplearning?",
                                                max_tokens=300,
                                                temperature=0,
                                                top_p=1.0,
                                                n=1)
        eplb_choices: list[openai.types.CompletionChoice] = batch.choices
    # EPLB only rebalances expert placement; the generated text must match.
    assert gt_choices[0].text == eplb_choices[
        0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
|
||||
93
tests/e2e/multicard/2-cards/test_shared_expert_dp.py
Normal file
93
tests/e2e/multicard/2-cards/test_shared_expert_dp.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
# Model exercised by the shared-expert-DP comparison below.
MODELS = [
    "deepseek-ai/DeepSeek-V2-Lite",
]
# Force spawn for worker processes.  NOTE(review): set at import time, so it
# affects every test collected after this module is imported — presumably
# needed because fork is unsafe once device state exists; confirm.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
    """Check shared-expert data parallelism does not change generation.

    Compares three runs over the same prompts:
      1. plain eager execution,
      2. eager execution with ``enable_shared_expert_dp``,
      3. aclgraph (FULL_DECODE_ONLY) execution with ``enable_shared_expert_dp``,
    and asserts runs 2 and 3 both match run 1 text-for-text.

    Fix: ``VLLM_ASCEND_ENABLE_FLASHCOMM1`` is now enabled via ``patch.dict``
    so it is restored when the test finishes; the previous bare
    ``os.environ[...] = "1"`` assignment leaked the setting into every later
    test in the same pytest process.
    """
    from unittest.mock import patch

    # An earlier test may have leaked this HCCL setting; drop it so this test
    # always runs with the default op-expansion mode.
    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
        del os.environ['HCCL_OP_EXPANSION_MODE']

    prompts = [
        "Hello, my name is", "The capital of the United States is",
        "The capital of France is", "The future of AI is"
    ]
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)

    # Run 1: baseline eager execution, no shared-expert DP.
    with VllmRunner(
            model,
            max_model_len=1024,
            enforce_eager=True,
            tensor_parallel_size=2,
            enable_expert_parallel=True,
    ) as runner:
        vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

    # FLASHCOMM1 stays on for both shared-expert-DP runs (as in the original
    # flow, where it was set once before run 2 and remained set for run 3).
    with patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"}):
        # Run 2: shared-expert DP, eager.
        with VllmRunner(
                model,
                max_model_len=1024,
                enforce_eager=True,
                tensor_parallel_size=2,
                enable_expert_parallel=True,
                additional_config={
                    "enable_shared_expert_dp": True,
                },
        ) as runner:
            shared_expert_dp_eager_outputs = runner.model.generate(
                prompts, sampling_params)

        # Run 3: shared-expert DP, decode-only aclgraph.
        with VllmRunner(
                model,
                max_model_len=1024,
                tensor_parallel_size=2,
                enable_expert_parallel=True,
                compilation_config={
                    "cudagraph_capture_sizes": [1, 4, 8, 16],
                    "cudagraph_mode": "FULL_DECODE_ONLY",
                },
                additional_config={
                    "enable_shared_expert_dp": True,
                },
        ) as runner:
            shared_expert_dp_aclgraph_outputs = runner.model.generate(
                prompts, sampling_params)

    def _index_text_pairs(outputs):
        """Collapse request outputs to comparable (index, text) tuples."""
        return [(o.outputs[0].index, o.outputs[0].text) for o in outputs]

    vllm_eager_outputs_list = _index_text_pairs(vllm_eager_outputs)
    shared_expert_dp_eager_outputs_list = _index_text_pairs(
        shared_expert_dp_eager_outputs)
    shared_expert_dp_aclgraph_outputs_list = _index_text_pairs(
        shared_expert_dp_aclgraph_outputs)

    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=shared_expert_dp_eager_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="shared_expert_dp_eager_outputs",
    )

    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=shared_expert_dp_aclgraph_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="shared_expert_dp_aclgraph_outputs",
    )
|
||||
85
tests/e2e/multicard/2-cards/test_single_request_aclgraph.py
Normal file
85
tests/e2e/multicard/2-cards/test_single_request_aclgraph.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
from tests.e2e.conftest import RemoteOpenAIServer
|
||||
|
||||
# One unquantized dense model and one W8A8 MLA model are exercised.
MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]

# Data-parallel sizes swept by the test below.
DATA_PARALLELS = [2]

prompts = [
    "San Francisco is a",
]

# Keyword arguments forwarded to the OpenAI completions request.
api_keyword_args = {
    "max_tokens": 10,
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models_single_request_aclgraph_dp2(model: str,
                                                  dp_size: int) -> None:
    """A single request against a DP=2 server must answer promptly.

    Starts an OpenAI-compatible server with data parallelism and sends one
    completion request; the request must return within 10 seconds (guards
    against aclgraph capture/replay hangs) and produce non-empty text.

    Fix: the two near-identical hand-written server-arg lists are collapsed
    into one base list; the W8A8 model only appends its quantization flags
    and a tighter context length.  The final argv is unchanged.
    """
    port = get_open_port()
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "HCCL_OP_EXPANSION_MODE": "AIV",
    }
    server_args = [
        "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
        "--data-parallel-size",
        str(dp_size),
    ]
    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        server_args += ["--quantization", "ascend", "--max-model-len", "1024"]
    server_args += [
        "--port",
        str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            vllm_serve_args=server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()

        try:
            # The 10 s cap is the point of the test: a hung aclgraph replay
            # would otherwise block indefinitely.
            batch = await asyncio.wait_for(client.completions.create(
                model=model,
                prompt=prompts,
                **request_keyword_args,
            ),
                                           timeout=10.0)
        except asyncio.TimeoutError:
            pytest.fail("Model did not return response within 10 seconds")

        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
|
||||
Reference in New Issue
Block a user