[CI] cleanup single/multi-card test (#5623)

1. Speed up the e2e light test.
2. Create `2-cards` and `4-cards` folders under the multicard tests.
3. Move ops tests to the nightly run.
4. Run tests in alphabetical order (one possible ordering hook is sketched below).
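A minimal sketch of one way to enforce alphabetical ordering via a standard pytest `conftest.py` hook; whether this PR enforces the order this way is not shown in the diff:

```python
# Hypothetical conftest.py snippet (illustrative, not taken from this PR).
def pytest_collection_modifyitems(session, config, items):
    # Sort collected test items by node id (file path + test name) so the
    # suite runs in alphabetical order.
    items.sort(key=lambda item: item.nodeid)
```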

- vLLM version: v0.13.0
- vLLM main:
8be6432bda

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2026-01-07 14:13:34 +08:00
committed by GitHub
parent 1afbc01ed4
commit 6f7a81cd9f
30 changed files with 114 additions and 117 deletions

View File

@@ -0,0 +1,240 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import gc
import math
import multiprocessing
import os
from typing import Any
from unittest.mock import patch
import pytest
import torch
from vllm.utils.network_utils import get_open_port
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
MODELS = [
# Offline data parallel mode is not supported/useful for dense models
# "Qwen/Qwen3-0.6B",
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
]
def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
"""Installs thread-safe spies on NPU methods to track invocation counts."""
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
def make_spy(cls, method_name, counter):
original = getattr(cls, method_name)
def spy(self, *args, **kwargs):
with counter.get_lock():
counter.value += 1
return original(self, *args, **kwargs)
return spy
stack = contextlib.ExitStack()
hooks = [
(torch.npu.NPUGraph, "replay", counters["replay"]),
(torch.npu.NPUGraph, "__init__", counters["capture"]),
(NPUModelRunner, "execute_model", counters["exec_model"]),
(NPUModelRunner, "_dummy_run", counters["dummy_run"]),
]
for cls, method, counter in hooks:
stack.enter_context(
patch.object(cls, method, make_spy(cls, method, counter)))
return stack
def _run_worker_process(
rank: int,
local_rank: int,
world_size: int,
master_ip: str,
master_port: int,
counters: dict[str, Any],
model_path: str,
max_tokens: int,
):
"""Main entry point for the worker process."""
os.environ.update({
"VLLM_DP_RANK": str(rank),
"VLLM_DP_RANK_LOCAL": str(local_rank),
"VLLM_DP_SIZE": str(world_size),
"VLLM_DP_MASTER_IP": master_ip,
"VLLM_DP_MASTER_PORT": str(master_port),
})
# Import vLLM only after environment setup
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (
destroy_distributed_environment, destroy_model_parallel)
# Apply hooks and run inference
with _install_spies(counters):
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Simple data sharding
chunk_size = len(prompts) // world_size
start_idx = rank * chunk_size
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
prompts)
local_prompts = prompts[start_idx:end_idx]
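# Worked example (illustrative): with 4 prompts and world_size=2,
# rank 0 handles prompts[0:2] and rank 1 handles prompts[2:4].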
llm = LLM(
model=model_path,
quantization="ascend" if "W8A8" in model_path else None,
enable_expert_parallel=True if "DeepSeek" in model_path else False,
trust_remote_code=True,
# vLLM enables async scheduling by default; remove the line below once vLLM >= 0.14.0.
async_scheduling=False,
)
# Expose model config to the main test process
counters["hidden_layers"].value = (
llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
llm.generate(local_prompts,
SamplingParams(max_tokens=max_tokens, temperature=0.0))
# Explicit cleanup is mandatory in multi-process vLLM tests
del llm
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
# @patch.dict(os.environ, clear=["HCCL_OP_EXPANSION_MODE","VLLM_WORKER_MULTIPROC_METHOD"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [4, 36])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
def test_models_aclgraph_capture_replay_metrics_dp2(
model: str,
max_tokens: int,
monkeypatch: pytest.MonkeyPatch,
) -> None:
# Counter doesn't work in default "spawn" mode
monkeypatch.delenv("VLLM_WORKER_MULTIPROC_METHOD", raising=False)
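# (Presumably, with "spawn" the worker processes re-import the modules and
# never see the spies installed by _install_spies, so the shared counters
# would stay at zero.)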
# Shared counters for cross-process assertion
counters = {
"replay": multiprocessing.Value("i", 0),
"capture": multiprocessing.Value("i", 0),
"exec_model": multiprocessing.Value("i", 0),
"dummy_run": multiprocessing.Value("i", 0),
"hidden_layers": multiprocessing.Value("i", -1),
}
dp_size = 2
port = get_open_port()
# Launch workers
workers = []
for rank in range(dp_size):
p = multiprocessing.Process(
target=_run_worker_process,
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
max_tokens),
)
p.start()
workers.append(p)
# Supervision loop
for p in workers:
p.join(timeout=900)
if p.exitcode != 0:
for k in workers:
if k.is_alive():
k.kill()
raise RuntimeError(
f"Worker {p.pid} failed with exit code {p.exitcode}")
actual_capture = counters["capture"].value
actual_replay = counters["replay"].value
num_execute_model = counters["exec_model"].value
num_dummy_run = counters["dummy_run"].value
num_layers = counters["hidden_layers"].value
num_acl_graphs = num_layers + 1
num_comm_groups = sum(1 for s in [dp_size, 1]
if s > 1) # dp_size=2, tp_size=1
# Metric 1: Graph Capture (ACL Graph Construction)
# Ref: vllm_ascend.utils.update_aclgraph_sizes
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
num_acl_graphs / (1 + num_comm_groups * 2))
expected_capture = max_batch_sizes * num_acl_graphs * dp_size
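# Illustrative numbers (assuming num_layers == 27 for DeepSeek-V2-Lite):
# num_acl_graphs = 28, num_comm_groups = 1,
# max_batch_sizes = floor(1760 / 28 / 3) = 20, so
# expected_capture = 20 * 28 * 2 = 1120.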
assert (
actual_capture == expected_capture
), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
# Metric 2: Model Execution (NPUModelRunner.execute_model)
# vLLM Step Breakdown:
# 1. First step (prefill, 1 prompt)
# 2. Generation steps (max_tokens)
# 3. Final step (likely EOS/idle step), no replay here
total_steps = max_tokens + 1 # this includes the 1 and 2 above
expected_exec_model = (total_steps + 1) * dp_size
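# Illustrative numbers: with max_tokens=4, total_steps = 5 and
# expected_exec_model = (5 + 1) * 2 = 12.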
assert (
num_execute_model == expected_exec_model
), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
# Metric 3: Dummy Runs (Warmup & Alignment)
# vLLM synchronizes globally every 32 steps.
# Ref: vllm.v1.engine.core.DPEngineCoreProc._has_global_unfinished_reqs
aligned_steps = (total_steps + 31) // 32 * 32
# Part A: Warmup runs (Profile run + 2 runs per captured graph)
warmup_runs = 1 + (2 * max_batch_sizes)
soc_version = get_ascend_device_type()
if soc_version in {AscendDeviceType.A3} and "DeepSeek" in model:
# An extra warmup run is needed for MC2 warmup here
warmup_runs += 1
# Part B: Alignment padding (Empty runs to hit the 32-step boundary)
padding_runs = aligned_steps - total_steps
expected_dummy_run = (warmup_runs + padding_runs) * dp_size
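# Illustrative numbers: with max_tokens=4 and max_batch_sizes=20,
# total_steps = 5, aligned_steps = 32, padding_runs = 27 and
# warmup_runs = 1 + 2 * 20 = 41 (42 on A3 with DeepSeek), so
# expected_dummy_run = (41 + 27) * 2 = 136.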
assert (
num_dummy_run == expected_dummy_run
), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
# Metric 4: Graph Replay (Inference Execution)
# Replays happen for every aligned step across all graphs.
expected_replay = num_acl_graphs * aligned_steps * dp_size
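# Illustrative numbers: with num_acl_graphs = 28 and aligned_steps = 32,
# expected_replay = 28 * 32 * 2 = 1792.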
assert (
actual_replay == expected_replay
), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"

View File

@@ -0,0 +1,79 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Run data parallel (DP2) inference end-to-end via examples/offline_data_parallel.py.
Run `pytest tests/multicard/test_data_parallel.py`.
"""
import os
import subprocess
import sys
from unittest.mock import patch
import pytest
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_qwen3_inference_dp2(model, max_tokens):
moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
script = "examples/offline_data_parallel.py"
env = os.environ.copy()
cmd = [
sys.executable,
script,
"--model",
model,
"--dp-size",
"2",
"--tp-size",
"1",
"--node-size",
"1",
"--node-rank",
"0",
"--trust-remote-code",
]
if model in moe_models:
cmd.append("--enable-expert-parallel")
if model in quantization_models:
cmd.append("--quantization")
cmd.append("ascend")
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "DP rank 0 needs to process" in output
assert "DP rank 1 needs to process" in output
assert "Generated text:" in output
assert proc.returncode == 0

View File

@@ -0,0 +1,34 @@
import pytest
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
def test_deepseek_correctness_ep(model_name):
example_prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
max_tokens = 5
    # FIXME: Strangely, chunked prefill can lead to different results; investigate further.
with VllmRunner(model_name,
cudagraph_capture_sizes=[1, 2, 4, 8],
tensor_parallel_size=2) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model_name,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
enable_expert_parallel=True) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=ep_output,
outputs_1_lst=tp_output,
name_0="ep_output",
name_1="tp_output",
)

View File

@@ -0,0 +1,239 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Run the external launcher example (examples/offline_external_launcher.py) end-to-end.
Run `pytest tests/multicard/test_external_launcher.py`.
"""
import os
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
import torch_npu
from modelscope import snapshot_download # type: ignore
MODELS = ["Qwen/Qwen3-0.6B"]
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script),
"--model",
model,
"--tp-size",
"1",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "TP RANKS: [0]" in output
assert "TP RANKS: [1]" in output
assert "Generated text:" in output
assert proc.returncode == 0
@pytest.mark.parametrize("model", MOE_MODELS)
def test_qwen3_moe_external_launcher_ep_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
"--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
"--enable-expert-parallel"
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "TP RANKS: [0, 1]" in output
assert "Generated text:" in output
assert proc.returncode == 0
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script),
"--model",
"Qwen/Qwen3-8B",
"--tp-size",
"1",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-sleep-mode",
"--temperature",
"0",
"--model-weight-gib",
"16",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "Generated text:" in output
assert "Sleep and wake up successfully!!" in output
assert proc.returncode == 0
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode_level2():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
model_path = snapshot_download("Qwen/Qwen3-8B")
# TODO: Add moe model test
cmd = [
sys.executable,
str(script),
"--model",
model_path,
"--tp-size",
"1",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-sleep-mode",
"--temperature",
"0",
"--model-weight-gib",
"16",
"--sleep-mode-level",
"2",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "Generated text:" in output
assert "Sleep and wake up successfully!!" in output
assert proc.returncode == 0
@pytest.mark.skipif(
DEVICE_NAME != "Ascend910B",
reason="This test is only for Ascend910B devices.",
)
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
"HCCL_BUFFSIZE": "500"
})
def test_qwen3_external_launcher_with_matmul_allreduce(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
str(script),
"--model",
model,
"--trust-remote-code",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "Generated text:" in output
assert proc.returncode == 0

View File

@@ -0,0 +1,114 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import os
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
def test_qwen3_moe_full_decode_only_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=vllm_fullgraph_outputs_list,
name_0="vllm_eager_outputs",
name_1="vllm_fullgraph_outputs",
)
def test_qwen3_moe_full_graph_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=vllm_fullgraph_outputs_list,
name_0="vllm_eager_outputs",
name_1="vllm_fullgraph_outputs",
)

View File

@@ -0,0 +1,25 @@
import pytest
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import VllmRunner
from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
MODEL_PATH, do_sample)
@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
with VllmRunner(
snapshot_download(MODEL_PATH),
enable_lora=True,
max_loras=4,
dtype="half",
max_model_len=1024,
max_num_seqs=16,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output[i] == EXPECTED_LORA_OUTPUT[i]

View File

@@ -0,0 +1,214 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/test_offline_inference.py`.
"""
import os
from unittest.mock import patch
import pytest
from modelscope import snapshot_download # type: ignore
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
QWEN_DENSE_MODELS = [
"vllm-ascend/Qwen3-0.6B-W8A8",
]
QWEN_W4A8_MODELS = [
"vllm-ascend/Qwen3-1.7B-W4A8-V1",
]
DEEPSEEK_W4A8_MODELS = [
"vllm-ascend/DeepSeek-V3.1-W4A8-puring",
]
def test_deepseek_multistream_moe_tp2():
example_prompts = [
"Hello, my name is",
]
dtype = "half"
max_tokens = 5
with VllmRunner(
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
additional_config={
"enable_multistream_moe": True,
"refresh": True,
},
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
def test_qwen3_w4a8_dynamic_tp2(model):
prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download(model),
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)
def test_qwen3_moe_sp_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
compilation_config={"pass_config": {
"enable_sp": True
}},
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
def test_deepseek_w4a8_accuracy_tp2(model):
prompts = [
"Hello, my name is", "The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
]
vllm_ds_w4a8_answers = [
'逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
]
sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
with VllmRunner(snapshot_download(model),
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
enable_expert_parallel=True) as vllm_model:
vllm_quant_outputs = vllm_model.model.generate(prompts,
sampling_params)
vllm_quant_outputs_list = []
for output in vllm_quant_outputs:
vllm_quant_outputs_list.append(
([output.outputs[0].index], output.outputs[0].text))
vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]
check_outputs_equal(outputs_0_lst=vllm_answer_list,
outputs_1_lst=vllm_quant_outputs_list,
name_0="vllm_answer_outputs",
name_1="vllm_quant_outputs")
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})
def test_qwen3_moe_fc2_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_deepseek_v2_lite_fc1_tp2() -> None:
example_prompts = [
"test" * 1001,
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
quantization="ascend") as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_qwen3_dense_fc1_tp2(model):
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download(model),
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download(model),
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -0,0 +1,74 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Run `pytest tests/multicard/test_offline_load_weight.py`.
"""
import os
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
MODELS = ["Qwen/Qwen3-30B-A3B"]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_offline_load_and_sleepmode_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
str(script),
"--model",
model,
"--tp-size",
"2",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-sleep-mode",
"--temperature",
"0",
"--model-weight-gib",
"0.8",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "Generated text:" in output
assert "Sleep and wake up successfully!!" in output
assert proc.returncode == 0

View File

@@ -0,0 +1,48 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
from tests.e2e.conftest import VllmRunner
MODELS = [
"Qwen/Qwen3-0.6B",
"deepseek-ai/DeepSeek-V2-Lite-Chat",
]
TENSOR_PARALLELS = [1]
PIPELINE_PARALLELS = [2]
DIST_EXECUTOR_BACKEND = ["mp", "ray"]
prompts = [
"Hello, my name is",
"The future of AI is",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
def test_models_pp2(model: str, tp_size: int, pp_size: int,
distributed_executor_backend: str) -> None:
with VllmRunner(model,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.7) as vllm_model:
vllm_model.generate_greedy(prompts, 64)

View File

@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching."""
import pytest
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODELS = [
# for MHA
"Qwen/Qwen3-8B",
# for MLA
"deepseek-ai/DeepSeek-V2-Lite-Chat"
]
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON |
| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK |
| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW |
| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ |
| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE |
| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY |
| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC |
| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK |
| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC|
| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ |
| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE |
| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA |
| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB |
| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK |
| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD |
| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ |
| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE |
| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA |
| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON |
| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK |
| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA |
| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ|
| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE |
| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO |
| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC |
| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK |
| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA |
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
"""
INPUT_PROMPTS = [
LONG_PROMPT +
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT +
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
with VllmRunner(model,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
enable_prefix_caching=False,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=prefix_cache_output,
name_0="vllm_output",
name_1="prefix_cache_output",
)

View File

@@ -0,0 +1,44 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import VllmRunner
def test_qwen2_5_w8a8_external_quantized_tp2():
example_prompts = [
"The president of the United States is",
]
max_tokens = 5
with VllmRunner(
snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
golden_results = [
'The president of the United States is the head of state and',
]
for i in range(len(vllm_output)):
assert golden_results[i] == vllm_output[i][1]
print(f"Generated text: {vllm_output[i][1]!r}")

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import json
import os
from unittest.mock import patch
import openai
import pytest
from modelscope import snapshot_download # type: ignore
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_qwen3_moe_distributed_mp_tp2_ep():
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
def test_qwen3_moe_w8a8_distributed_tp2():
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"),
max_model_len=8192,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
def test_qwen3_moe_distributed_aiv_tp2():
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
example_prompts = [
"Hello, my name is",
]
dtype = "auto"
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.asyncio
async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
port = get_open_port()
server_args = [
"--max_model_len", "8192", "--tensor_parallel_size", "2",
"--enable_expert_parallel", "--quantization", "ascend", "--port",
str(port), "--enforce_eager"
]
env_dict = {"HCCL_BUFFSIZE": "1024"}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=300,
temperature=0,
top_p=1.0,
n=1)
gt_choices: list[openai.types.CompletionChoice] = batch.choices
# dynamic eplb test
# Since pytest runs as a daemon process, it conflicts with the dynamic EPLB
# manager during initialization in offline mode, so online (server) mode is
# used instead.
env_dict.update({"DYNAMIC_EPLB": "true"})
additional_config = {
"dynamic_eplb": True,
"num_iterations_eplb_update": 100,
"num_wait_worker_iterations": 20
}
server_args.extend(["--additional-config", json.dumps(additional_config)])
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=300,
temperature=0,
top_p=1.0,
n=1)
eplb_choices: list[openai.types.CompletionChoice] = batch.choices
assert gt_choices[0].text == eplb_choices[
0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"

View File

@@ -0,0 +1,93 @@
import os
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODELS = [
"deepseek-ai/DeepSeek-V2-Lite",
]
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@pytest.mark.parametrize("model", MODELS)
def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
prompts = [
"Hello, my name is", "The capital of the United States is",
"The capital of France is", "The future of AI is"
]
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_eager_outputs = runner.model.generate(
prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
enable_expert_parallel=True,
compilation_config={
"cudagraph_capture_sizes": [1, 4, 8, 16],
"cudagraph_mode": "FULL_DECODE_ONLY",
},
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_eager_outputs_list = []
for output in shared_expert_dp_eager_outputs:
shared_expert_dp_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_aclgraph_outputs_list = []
for output in shared_expert_dp_aclgraph_outputs:
shared_expert_dp_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=shared_expert_dp_eager_outputs_list,
name_0="vllm_eager_outputs",
name_1="shared_expert_dp_eager_outputs",
)
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=shared_expert_dp_aclgraph_outputs_list,
name_0="vllm_eager_outputs",
name_1="shared_expert_dp_aclgraph_outputs",
)

View File

@@ -0,0 +1,85 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import asyncio
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
DATA_PARALLELS = [2]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models_single_request_aclgraph_dp2(model: str,
dp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
"HCCL_OP_EXPANSION_MODE": "AIV",
}
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--data-parallel-size",
str(dp_size), "--quantization", "ascend", "--max-model-len",
"1024", "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
]
else:
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--data-parallel-size",
str(dp_size), "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
vllm_serve_args=server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
try:
batch = await asyncio.wait_for(client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
),
timeout=10.0)
except asyncio.TimeoutError:
pytest.fail("Model did not return response within 10 seconds")
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"