[Lint] Style: Convert tests/ to ruff format (Batch #1) (#6738)

### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
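
The conversion is intended to be purely stylistic: argument lists, quoting, and line wrapping are rewritten to `ruff format` conventions while test behaviour stays the same. As a minimal sketch (not part of this PR) of how such a batch can be reformatted and verified locally, the snippet below assumes `ruff` is installed and picks up the repository's `pyproject.toml` configuration; the path list simply mirrors the scope table above.

```python
# Hypothetical local workflow (not part of this PR); assumes `ruff` is installed
# and configured via the repository's pyproject.toml.
import subprocess

BATCH_1_PATHS = [
    "tests/e2e/310p",
    "tests/e2e/conftest.py",
    "tests/e2e/model_utils.py",
    "tests/e2e/models",
    "tests/e2e/multicard/2-cards",
]

# Rewrite the files in place.
subprocess.run(["ruff", "format", *BATCH_1_PATHS], check=True)

# CI-style check: exits non-zero if any file is still unformatted.
subprocess.run(["ruff", "format", "--check", *BATCH_1_PATHS], check=True)
```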

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Authored by: SILONG ZENG
Committed by: GitHub
Date: 2026-03-10 09:52:50 +08:00
Commit: 43df2cb2fc (parent: 9216e1b050)
27 changed files with 753 additions and 859 deletions

View File

@@ -18,15 +18,12 @@
from __future__ import annotations
import math
import os
import random
from typing import Any, Union
from unittest.mock import patch
import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm import SamplingParams
from vllm.config import CompilationConfig
from vllm.v1.metrics.reader import Counter, Vector
@@ -101,7 +98,8 @@ def test_eagle3_sp_acceptance(
[prompt],
tokenize=False,
add_generation_prompt=True,
) for prompt in prompts
)
for prompt in prompts
]
speculative_config = {
@@ -112,21 +110,20 @@ def test_eagle3_sp_acceptance(
"model": spec_model_name,
}
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY",
cudagraph_capture_sizes=[12])
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY", cudagraph_capture_sizes=[12])
with VllmRunner(
main_model_name,
enforce_eager=True,
max_model_len=8192,
disable_log_stats=False,
tensor_parallel_size=2,
max_num_seqs=256,
distributed_executor_backend="mp",
gpu_memory_utilization=0.7,
speculative_config=speculative_config,
compilation_config=compilation_config,
async_scheduling=async_scheduling,
main_model_name,
enforce_eager=True,
max_model_len=8192,
disable_log_stats=False,
tensor_parallel_size=2,
max_num_seqs=256,
distributed_executor_backend="mp",
gpu_memory_utilization=0.7,
speculative_config=speculative_config,
compilation_config=compilation_config,
async_scheduling=async_scheduling,
) as llm:
_ = llm.generate(prompts, sampling_params)
metrics = llm.model.get_metrics()
@@ -142,10 +139,7 @@ def test_eagle3_sp_acceptance(
for pos in range(len(metric.values)):
num_accepted_tokens_per_pos[pos] += metric.values[pos]
acceptance_per_pos = [
num_accepted_tokens / num_drafts
for num_accepted_tokens in num_accepted_tokens_per_pos
]
acceptance_per_pos = [num_accepted_tokens / num_drafts for num_accepted_tokens in num_accepted_tokens_per_pos]
golden = BASELINES_SP[method]
match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))

View File

@@ -25,8 +25,8 @@ import pytest
import torch
from vllm.utils.network_utils import get_open_port
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
from tests.e2e.conftest import wait_until_npu_memory_free
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
MODELS = [
# Offline data parallel mode will be not supported/useful for dense models
@@ -58,8 +58,7 @@ def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
]
for cls, method, counter in hooks:
stack.enter_context(
patch.object(cls, method, make_spy(cls, method, counter)))
stack.enter_context(patch.object(cls, method, make_spy(cls, method, counter)))
return stack
@@ -75,18 +74,19 @@ def _run_worker_process(
max_tokens: int,
):
"""Main entry point for the worker process."""
os.environ.update({
"VLLM_DP_RANK": str(rank),
"VLLM_DP_RANK_LOCAL": str(local_rank),
"VLLM_DP_SIZE": str(world_size),
"VLLM_DP_MASTER_IP": master_ip,
"VLLM_DP_MASTER_PORT": str(master_port),
})
os.environ.update(
{
"VLLM_DP_RANK": str(rank),
"VLLM_DP_RANK_LOCAL": str(local_rank),
"VLLM_DP_SIZE": str(world_size),
"VLLM_DP_MASTER_IP": master_ip,
"VLLM_DP_MASTER_PORT": str(master_port),
}
)
# Import vLLM only after environment setup
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (
destroy_distributed_environment, destroy_model_parallel)
from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
# Apply hooks and run inference
with _install_spies(counters):
@@ -100,23 +100,20 @@ def _run_worker_process(
# Simple data sharding
chunk_size = len(prompts) // world_size
start_idx = rank * chunk_size
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
prompts)
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
local_prompts = prompts[start_idx:end_idx]
llm = LLM(
model=model_path,
quantization="ascend" if "W8A8" in model_path else None,
enable_expert_parallel=True if "DeepSeek" in model_path else False,
enable_expert_parallel="DeepSeek" in model_path,
trust_remote_code=True,
)
# Expose model config to the main test process
counters["hidden_layers"].value = (
llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
counters["hidden_layers"].value = llm.llm_engine.model_config.hf_text_config.num_hidden_layers
llm.generate(local_prompts,
SamplingParams(max_tokens=max_tokens, temperature=0.0))
llm.generate(local_prompts, SamplingParams(max_tokens=max_tokens, temperature=0.0))
# Explicit cleanup is mandatory in multi-process vLLM tests
del llm
@@ -162,8 +159,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
for rank in range(dp_size):
p = multiprocessing.Process(
target=_run_worker_process,
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
max_tokens),
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model, max_tokens),
)
p.start()
workers.append(p)
@@ -175,8 +171,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
for k in workers:
if k.is_alive():
k.kill()
raise RuntimeError(
f"Worker {p.pid} failed with exit code {p.exitcode}")
raise RuntimeError(f"Worker {p.pid} failed with exit code {p.exitcode}")
actual_capture = counters["capture"].value
actual_replay = counters["replay"].value
@@ -185,18 +180,16 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
num_layers = counters["hidden_layers"].value
num_acl_graphs = num_layers + 1
num_comm_groups = sum(1 for s in [dp_size, 1]
if s > 1) # dp_size=2, tp_size=1
num_comm_groups = sum(1 for s in [dp_size, 1] if s > 1) # dp_size=2, tp_size=1
# Metric 1: Graph Capture (ACL Graph Construction)
# Ref: vllm_ascend.utils.update_aclgraph_sizes
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
num_acl_graphs / (1 + num_comm_groups * 2))
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))
expected_capture = max_batch_sizes * num_acl_graphs * dp_size
assert (
actual_capture == expected_capture
), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
assert actual_capture == expected_capture, (
f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
)
# Metric 2: Model Execution (NPUModelRunner.execute_model)
# vLLM Step Breakdown:
@@ -207,9 +200,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
# vllm default enables Async scheduler, this will take 1 more steps
expected_exec_model = (total_steps + 1 + 1) * dp_size
assert (
num_execute_model == expected_exec_model
), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
assert num_execute_model == expected_exec_model, (
f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
)
# Metric 3: Dummy Runs (Warmup & Alignment)
# vLLM synchronizes globally every 32 steps.
@@ -228,14 +221,12 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
expected_dummy_run = (warmup_runs + padding_runs) * dp_size
assert (
num_dummy_run == expected_dummy_run
), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
assert num_dummy_run == expected_dummy_run, (
f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
)
# Metric 4: Graph Replay (Inference Execution)
# Replays happen for every aligned step across all graphs.
expected_replay = num_acl_graphs * aligned_steps * dp_size
assert (
actual_replay == expected_replay
), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
assert actual_replay == expected_replay, f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"

View File

@@ -64,12 +64,8 @@ def test_qwen3_inference_dp2(model, max_tokens):
cmd.append("ascend")
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600)
output = proc.stdout.decode(errors='ignore')
proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=600)
output = proc.stdout.decode(errors="ignore")
print(output)

View File

@@ -27,6 +27,7 @@ MODELS = [
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@@ -36,36 +37,61 @@ async def test_models(model: str, tp_size: int) -> None:
vllm_server_args = [
[
"--port",
str(encode_port), "--model", model, "--gpu-memory-utilization",
"0.01", "--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
"--max-num-seqs", "1", "--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
str(encode_port),
"--model",
model,
"--gpu-memory-utilization",
"0.01",
"--tensor-parallel-size",
str(tp_size),
"--enforce-eager",
"--no-enable-prefix-caching",
"--max-model-len",
"10000",
"--max-num-batched-tokens",
"10000",
"--max-num-seqs",
"1",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"'
+ SHARED_STORAGE_PATH
+ '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}',
],
[
"--port",
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
str(pd_port),
"--model",
model,
"--gpu-memory-utilization",
"0.95",
"--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
str(tp_size),
"--enforce-eager",
"--max-model-len",
"10000",
"--max-num-batched-tokens",
"10000",
"--max-num-seqs",
"128",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
]
'{"ec_connector_extra_config":{"shared_storage_path":"'
+ SHARED_STORAGE_PATH
+ '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}',
],
]
proxy_port = get_open_port()
proxy_args = [
"--host", "127.0.0.1", "--port",
str(proxy_port), "--encode-servers-urls",
f"http://localhost:{encode_port}", "--decode-servers-urls",
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
"--host",
"127.0.0.1",
"--port",
str(proxy_port),
"--encode-servers-urls",
f"http://localhost:{encode_port}",
"--decode-servers-urls",
f"http://localhost:{pd_port}",
"--prefill-servers-urls",
"disable",
]
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
send_image_request(model, proxy)
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _, DisaggEpdProxy(proxy_args=proxy_args) as proxy:
send_image_request(model, proxy)

View File

@@ -15,15 +15,12 @@ def test_deepseek_correctness_ep(model_name):
max_tokens = 5
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
with VllmRunner(model_name,
cudagraph_capture_sizes=[1, 2, 4, 8],
tensor_parallel_size=2) as vllm_model:
with VllmRunner(model_name, cudagraph_capture_sizes=[1, 2, 4, 8], tensor_parallel_size=2) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model_name,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
enable_expert_parallel=True) as vllm_model:
with VllmRunner(
model_name, tensor_parallel_size=2, cudagraph_capture_sizes=[1, 2, 4, 8], enable_expert_parallel=True
) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(

View File

@@ -29,6 +29,7 @@ from unittest.mock import patch
import pytest
import torch_npu
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import wait_until_npu_memory_free
MODELS = ["Qwen/Qwen3-0.6B"]
@@ -39,9 +40,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
@@ -68,7 +67,7 @@ def test_qwen3_external_launcher(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
@@ -81,16 +80,24 @@ def test_qwen3_external_launcher(model):
@pytest.mark.parametrize("model", MOE_MODELS)
@wait_until_npu_memory_free()
def test_qwen3_moe_external_launcher_ep_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
"--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
"--enable-expert-parallel"
str(script),
"--model",
model,
"--tp-size",
"2",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-expert-parallel",
]
print(f"Running subprocess: {' '.join(cmd)}")
@@ -101,7 +108,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
@@ -113,9 +120,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
@wait_until_npu_memory_free()
def test_qwen3_external_launcher_with_sleepmode():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
@@ -147,7 +152,7 @@ def test_qwen3_external_launcher_with_sleepmode():
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
@@ -158,9 +163,7 @@ def test_qwen3_external_launcher_with_sleepmode():
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode_level2():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
model_path = snapshot_download("Qwen/Qwen3-8B")
# TODO: Add moe model test
@@ -195,7 +198,7 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
@@ -210,14 +213,9 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
)
@pytest.mark.parametrize("model", MODELS)
@wait_until_npu_memory_free()
@patch.dict(os.environ, {
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
"HCCL_BUFFSIZE": "500"
})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1", "HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher_with_matmul_allreduce(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
@@ -236,7 +234,7 @@ def test_qwen3_external_launcher_with_matmul_allreduce(model):
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
assert "Generated text:" in output

View File

@@ -26,41 +26,39 @@ from tests.e2e.model_utils import check_outputs_equal
def test_qwen3_moe_full_decode_only_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
@@ -72,41 +70,39 @@ def test_qwen3_moe_full_decode_only_tp2():
@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
def test_qwen3_moe_full_graph_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={"cudagraph_mode": "FULL", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -1,23 +1,22 @@
import pytest
from tests.e2e.conftest import VllmRunner
from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
MODEL_PATH, do_sample)
from tests.e2e.singlecard.test_ilama_lora import EXPECTED_LORA_OUTPUT, MODEL_PATH, do_sample
@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
with VllmRunner(
MODEL_PATH,
enable_lora=True,
max_loras=4,
dtype="half",
max_model_len=1024,
max_num_seqs=16,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
MODEL_PATH,
enable_lora=True,
max_loras=4,
dtype="half",
max_model_len=1024,
max_num_seqs=16,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
) as vllm_model:
output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)

View File

@@ -20,8 +20,10 @@
Run `pytest tests/test_offline_inference.py`.
"""
import os
from unittest.mock import patch
import pytest
from vllm import SamplingParams
@@ -51,6 +53,7 @@ GPT_OSS_MODELS = [
"unsloth/gpt-oss-20b-BF16",
]
def test_deepseek_multistream_moe_tp2():
example_prompts = [
"Hello, my name is",
@@ -58,15 +61,15 @@ def test_deepseek_multistream_moe_tp2():
dtype = "half"
max_tokens = 5
with VllmRunner(
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
additional_config={
"enable_multistream_moe": True,
"refresh": True,
},
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
additional_config={
"enable_multistream_moe": True,
"refresh": True,
},
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -78,12 +81,12 @@ def test_qwen3_w4a8_dynamic_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)
@@ -92,20 +95,17 @@ def test_qwen3_moe_sp_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner("Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
compilation_config={"pass_config": {
"enable_sp": True
}},
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
compilation_config={"pass_config": {"enable_sp": True}},
enable_expert_parallel=True,
enforce_eager=True,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -113,33 +113,34 @@ def test_qwen3_moe_sp_tp2() -> None:
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
def test_deepseek_w4a8_accuracy_tp2(model):
prompts = [
"Hello, my name is", "The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
]
vllm_ds_w4a8_answers = [
'逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
"Hello, my name is",
"The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs",
]
vllm_ds_w4a8_answers = ["逍遙而至地去 accrued", "平行于我udo madreHelen", "ysteepaolis backwards Kj"]
sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
with VllmRunner(model,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
enable_expert_parallel=True) as vllm_model:
vllm_quant_outputs = vllm_model.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
enable_expert_parallel=True,
) as vllm_model:
vllm_quant_outputs = vllm_model.model.generate(prompts, sampling_params)
vllm_quant_outputs_list = []
for output in vllm_quant_outputs:
vllm_quant_outputs_list.append(
([output.outputs[0].index], output.outputs[0].text))
vllm_quant_outputs_list.append(([output.outputs[0].index], output.outputs[0].text))
vllm_answer_list = []
vllm_answer_list = ([([0], answer) for answer in vllm_ds_w4a8_answers])
vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]
check_outputs_equal(outputs_0_lst=vllm_answer_list,
outputs_1_lst=vllm_quant_outputs_list,
name_0="vllm_quant_outputs",
name_1="vllm_answer_outputs")
check_outputs_equal(
outputs_0_lst=vllm_answer_list,
outputs_1_lst=vllm_quant_outputs_list,
name_0="vllm_quant_outputs",
name_1="vllm_answer_outputs",
)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@@ -148,17 +149,16 @@ def test_qwen3_moe_fc2_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner("Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -168,20 +168,17 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=
True, # TODO(Levi-JQ): support graph mode for fc2 in Qwen
additional_config={"layer_sharding": ["o_proj"]}) as vllm_model:
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True, # TODO(Levi-JQ): support graph mode for fc2 in Qwen
additional_config={"layer_sharding": ["o_proj"]},
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -190,17 +187,16 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
example_prompts = [
"test" * 1001,
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
quantization="ascend") as vllm_model:
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner(
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
quantization="ascend",
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -213,12 +209,12 @@ def test_qwen3_dense_fc1_tp2(model):
max_tokens = 5
with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -232,13 +228,13 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
max_tokens = 5
with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
additional_config={"weight_prefetch_config": {"enabled": True}},
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
additional_config={"weight_prefetch_config": {"enabled": True}},
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -252,28 +248,20 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
"Hello ",
]
# "max_position_embeddings": 163840,
long_example_prompts = [
"Hello " * (163839 - 500) + "Hello"
]
long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
max_tokens = 500
with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
tensor_parallel_size=2,
quantization="ascend",
enable_expert_parallel=True,
max_model_len=163840,
compilation_config={
"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
speculative_config={
"num_speculative_tokens": 1,
"method": "deepseek_mtp"
},
additional_config={
"layer_sharding":["q_b_proj", "o_proj"]
},
reasoning_parser="deepseek_v3",
tokenizer_mode="deepseek_v32") as vllm_model:
with VllmRunner(
"vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
tensor_parallel_size=2,
quantization="ascend",
enable_expert_parallel=True,
max_model_len=163840,
compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
reasoning_parser="deepseek_v3",
tokenizer_mode="deepseek_v32",
) as vllm_model:
vllm_model.generate_greedy(short_example_prompts, max_tokens)
vllm_model.generate_greedy(long_example_prompts, max_tokens)
@@ -285,10 +273,10 @@ def test_qwen3_w4a4_distributed_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -300,8 +288,8 @@ def test_gpt_oss_distributed_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
tensor_parallel_size=2,
enforce_eager=True,
model,
tensor_parallel_size=2,
enforce_eager=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -32,9 +32,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_offline_load_and_sleepmode_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
@@ -65,7 +63,7 @@ def test_qwen3_offline_load_and_sleepmode_tp2(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)

View File

@@ -37,12 +37,13 @@ prompts = [
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
def test_models_pp2(model: str, tp_size: int, pp_size: int,
distributed_executor_backend: str) -> None:
with VllmRunner(model,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.7) as vllm_model:
def test_models_pp2(model: str, tp_size: int, pp_size: int, distributed_executor_backend: str) -> None:
with VllmRunner(
model,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_model.generate_greedy(prompts, 64)

View File

@@ -11,11 +11,14 @@ MODELS = [
# for MHA
"Qwen/Qwen3-8B",
# for MLA
"deepseek-ai/DeepSeek-V2-Lite-Chat"
"deepseek-ai/DeepSeek-V2-Lite-Chat",
]
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
# ruff: noqa: E501
LONG_PROMPT = (
"You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+ """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
@@ -49,32 +52,34 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
"""
)
INPUT_PROMPTS = [
LONG_PROMPT +
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT +
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
with VllmRunner(model,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
with VllmRunner(
model,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
enable_prefix_caching=False,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
with VllmRunner(
model,
enable_prefix_caching=False,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
check_outputs_equal(

View File

@@ -16,7 +16,6 @@
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import pytest
from tests.e2e.conftest import VllmRunner
@@ -27,16 +26,16 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
]
max_tokens = 5
with VllmRunner(
"neuralmagic/Qwen2.5-3B-quantized.w8a8",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
"neuralmagic/Qwen2.5-3B-quantized.w8a8",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]
for i in range(len(vllm_output)):
@@ -50,36 +49,37 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]
for i in range(len(vllm_output)):
assert golden_results[i] == vllm_output[i][1]
print(f"Generated text: {vllm_output[i][1]!r}")
def test_qwen3_moe_w4a8_dynamic_llm_compressor():
example_prompts = [
"The president of the United States is",
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]
for i in range(len(vllm_output)):

View File

@@ -34,11 +34,11 @@ def test_qwen3_moe_distributed_mp_tp2_ep():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -49,27 +49,27 @@ def test_qwen3_moe_w8a8_distributed_tp2():
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-W8A8",
max_model_len=8192,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
"vllm-ascend/Qwen3-30B-A3B-W8A8",
max_model_len=8192,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
def test_qwen3_moe_distributed_aiv_tp2():
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
os.environ["HCCL_OP_EXPANSION_MODE"] = "AIV"
example_prompts = [
"Hello, my name is",
]
dtype = "auto"
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
"Qwen/Qwen3-30B-A3B",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -80,23 +80,24 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
port = get_open_port()
compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
server_args = [
"--max_model_len", "8192", "--tensor_parallel_size", "2",
"--enable_expert_parallel", "--quantization", "ascend", "--port",
str(port), "--compilation-config", compilation_config
"--max_model_len",
"8192",
"--tensor_parallel_size",
"2",
"--enable_expert_parallel",
"--quantization",
"ascend",
"--port",
str(port),
"--compilation-config",
compilation_config,
]
env_dict = {"HCCL_BUFFSIZE": "1024"}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=400,
temperature=0,
top_p=1.0,
n=1)
batch = await client.completions.create(
model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
)
gt_choices: list[openai.types.CompletionChoice] = batch.choices
# dynamic eplb test
@@ -108,22 +109,14 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
"dynamic_eplb": True,
"expert_heat_collection_interval": 100,
"algorithm_execution_interval": 20,
"num_redundant_experts": 2
"num_redundant_experts": 2,
}
}
server_args.extend(["--additional-config", json.dumps(additional_config)])
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=400,
temperature=0,
top_p=1.0,
n=1)
batch = await client.completions.create(
model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
)
eplb_choices: list[openai.types.CompletionChoice] = batch.choices
assert gt_choices[0].text == eplb_choices[
0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
assert gt_choices[0].text == eplb_choices[0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"

View File

@@ -1,10 +1,11 @@
import os
from unittest.mock import patch
from tests.e2e.conftest import VllmRunner
from vllm import SamplingParams
from vllm.sampling_params import RequestOutputKind
from tests.e2e.conftest import VllmRunner
@patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})
def test_qwen3_moe_routing_replay():
@@ -12,18 +13,15 @@ def test_qwen3_moe_routing_replay():
"Hello, please introduce yourself.",
]
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
enable_return_routed_experts=True,
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
enable_return_routed_experts=True,
) as vllm_model:
sampling_params = SamplingParams(
max_tokens=5,
temperature=0.8,
top_p=0.95,
output_kind=RequestOutputKind.FINAL_ONLY
max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY
)
inputs = vllm_model.get_inputs(prompts=prompts)
outputs = vllm_model.model.generate(prompts=inputs, sampling_params=sampling_params)

View File

@@ -84,11 +84,7 @@ async def test_models(model: str) -> None:
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, env_dict=env_dict, auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,

View File

@@ -13,69 +13,65 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The capital of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The capital of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={
"enable_shared_expert_dp": True,
},
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_eager_outputs = runner.model.generate(
prompts, sampling_params)
shared_expert_dp_eager_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
enable_expert_parallel=True,
compilation_config={
"cudagraph_capture_sizes": [1, 4, 8, 16],
"cudagraph_mode": "FULL_DECODE_ONLY",
},
additional_config={
"enable_shared_expert_dp": True,
},
model,
max_model_len=1024,
tensor_parallel_size=2,
enable_expert_parallel=True,
compilation_config={
"cudagraph_capture_sizes": [1, 4, 8, 16],
"cudagraph_mode": "FULL_DECODE_ONLY",
},
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
shared_expert_dp_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_eager_outputs_list = []
for output in shared_expert_dp_eager_outputs:
shared_expert_dp_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_aclgraph_outputs_list = []
for output in shared_expert_dp_aclgraph_outputs:
shared_expert_dp_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -39,8 +39,7 @@ api_keyword_args = {
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models_single_request_aclgraph_dp2(model: str,
dp_size: int) -> None:
async def test_models_single_request_aclgraph_dp2(model: str, dp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
@@ -48,36 +47,51 @@ async def test_models_single_request_aclgraph_dp2(model: str,
}
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--no-enable-prefix-caching",
"--tensor-parallel-size",
"1",
"--data-parallel-size",
str(dp_size), "--quantization", "ascend", "--max-model-len",
"1024", "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
str(dp_size),
"--quantization",
"ascend",
"--max-model-len",
"1024",
"--port",
str(port),
"--trust-remote-code",
"--gpu-memory-utilization",
"0.9",
]
else:
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--no-enable-prefix-caching",
"--tensor-parallel-size",
"1",
"--data-parallel-size",
str(dp_size), "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
str(dp_size),
"--port",
str(port),
"--trust-remote-code",
"--gpu-memory-utilization",
"0.9",
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
vllm_serve_args=server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
with RemoteOpenAIServer(
model, vllm_serve_args=server_args, server_port=port, env_dict=env_dict, auto_port=False
) as server:
client = server.get_async_client()
try:
batch = await asyncio.wait_for(client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
),
timeout=10.0)
batch = await asyncio.wait_for(
client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
),
timeout=10.0,
)
except asyncio.TimeoutError:
pytest.fail("Model did not return response within 10 seconds")

View File

@@ -1,5 +1,3 @@
import os
import pytest
from vllm import SamplingParams
@@ -14,47 +12,46 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
def test_qwen3_vl_sp_tp2(model: str) -> None:
prompts = [
"Hello, my name is", "The capital of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The capital of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=10, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": False}
},
additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}}
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": False},
},
additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}},
) as runner:
no_sp_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": True}
},
additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}}
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": True},
},
additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}},
) as runner:
sp_outputs = runner.model.generate(
prompts, sampling_params)
sp_outputs = runner.model.generate(prompts, sampling_params)
no_sp_outputs_list = []
for output in no_sp_outputs:
no_sp_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
no_sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
sp_outputs_list = []
for output in sp_outputs:
sp_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=no_sp_outputs_list,