### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
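As far as the diff below shows, the changes are style-only: test code is re-wrapped from the old yapf-style layout (arguments aligned under the opening parenthesis) into a single-line, trailing-comma style, with quote normalization and no behavior change (the added `# ruff: noqa: E501` marker suggests ruff as the formatter). A representative before/after pair taken from the diff:

```python
# Before: arguments wrapped and aligned under the opening parenthesis
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY",
                                       cudagraph_capture_sizes=[12])

# After: the call fits on one line under the new line-length limit
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY", cudagraph_capture_sizes=[12])
```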
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
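Among the reformatted tests, `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` checks ACL-graph capture counts against a closed-form expectation. That formula can be sanity-checked by hand; the sketch below uses a hypothetical `num_layers = 28` (the test reads the real value from the model config), and the constants 1800 and 40 come from `vllm_ascend.utils.update_aclgraph_sizes` as cited in the test:

```python
import math

num_layers = 28  # hypothetical; the test reads num_hidden_layers from the model config
dp_size, tp_size = 2, 1

num_acl_graphs = num_layers + 1
num_comm_groups = sum(1 for s in [dp_size, tp_size] if s > 1)  # -> 1 for dp=2, tp=1
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))
expected_capture = max_batch_sizes * num_acl_graphs * dp_size

print(max_batch_sizes, expected_capture)  # -> 20 1160 for these assumed values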
- vLLM version: v0.15.0
- vLLM main: 9562912cea
Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
@@ -18,15 +18,12 @@

from __future__ import annotations

import math
import os
import random
from typing import Any, Union
from unittest.mock import patch

import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm import SamplingParams
from vllm.config import CompilationConfig
from vllm.v1.metrics.reader import Counter, Vector

@@ -101,7 +98,8 @@ def test_eagle3_sp_acceptance(
[prompt],
tokenize=False,
add_generation_prompt=True,
) for prompt in prompts
)
for prompt in prompts
]

speculative_config = {
@@ -112,21 +110,20 @@ def test_eagle3_sp_acceptance(
"model": spec_model_name,
}

compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY",
cudagraph_capture_sizes=[12])
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY", cudagraph_capture_sizes=[12])

with VllmRunner(
main_model_name,
enforce_eager=True,
max_model_len=8192,
disable_log_stats=False,
tensor_parallel_size=2,
max_num_seqs=256,
distributed_executor_backend="mp",
gpu_memory_utilization=0.7,
speculative_config=speculative_config,
compilation_config=compilation_config,
async_scheduling=async_scheduling,
main_model_name,
enforce_eager=True,
max_model_len=8192,
disable_log_stats=False,
tensor_parallel_size=2,
max_num_seqs=256,
distributed_executor_backend="mp",
gpu_memory_utilization=0.7,
speculative_config=speculative_config,
compilation_config=compilation_config,
async_scheduling=async_scheduling,
) as llm:
_ = llm.generate(prompts, sampling_params)
metrics = llm.model.get_metrics()
@@ -142,10 +139,7 @@ def test_eagle3_sp_acceptance(
for pos in range(len(metric.values)):
num_accepted_tokens_per_pos[pos] += metric.values[pos]

acceptance_per_pos = [
num_accepted_tokens / num_drafts
for num_accepted_tokens in num_accepted_tokens_per_pos
]
acceptance_per_pos = [num_accepted_tokens / num_drafts for num_accepted_tokens in num_accepted_tokens_per_pos]
golden = BASELINES_SP[method]

match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))

@@ -25,8 +25,8 @@ import pytest
import torch
from vllm.utils.network_utils import get_open_port

from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
from tests.e2e.conftest import wait_until_npu_memory_free
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

MODELS = [
# Offline data parallel mode will be not supported/useful for dense models
@@ -58,8 +58,7 @@ def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
]

for cls, method, counter in hooks:
stack.enter_context(
patch.object(cls, method, make_spy(cls, method, counter)))
stack.enter_context(patch.object(cls, method, make_spy(cls, method, counter)))

return stack

@@ -75,18 +74,19 @@ def _run_worker_process(
max_tokens: int,
):
"""Main entry point for the worker process."""
os.environ.update({
"VLLM_DP_RANK": str(rank),
"VLLM_DP_RANK_LOCAL": str(local_rank),
"VLLM_DP_SIZE": str(world_size),
"VLLM_DP_MASTER_IP": master_ip,
"VLLM_DP_MASTER_PORT": str(master_port),
})
os.environ.update(
{
"VLLM_DP_RANK": str(rank),
"VLLM_DP_RANK_LOCAL": str(local_rank),
"VLLM_DP_SIZE": str(world_size),
"VLLM_DP_MASTER_IP": master_ip,
"VLLM_DP_MASTER_PORT": str(master_port),
}
)

# Import vLLM only after environment setup
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (
destroy_distributed_environment, destroy_model_parallel)
from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel

# Apply hooks and run inference
with _install_spies(counters):
@@ -100,23 +100,20 @@ def _run_worker_process(
# Simple data sharding
chunk_size = len(prompts) // world_size
start_idx = rank * chunk_size
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
prompts)
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
local_prompts = prompts[start_idx:end_idx]

llm = LLM(
model=model_path,
quantization="ascend" if "W8A8" in model_path else None,
enable_expert_parallel=True if "DeepSeek" in model_path else False,
enable_expert_parallel="DeepSeek" in model_path,
trust_remote_code=True,
)

# Expose model config to the main test process
counters["hidden_layers"].value = (
llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
counters["hidden_layers"].value = llm.llm_engine.model_config.hf_text_config.num_hidden_layers

llm.generate(local_prompts,
SamplingParams(max_tokens=max_tokens, temperature=0.0))
llm.generate(local_prompts, SamplingParams(max_tokens=max_tokens, temperature=0.0))

# Explicit cleanup is mandatory in multi-process vLLM tests
del llm
@@ -162,8 +159,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
for rank in range(dp_size):
p = multiprocessing.Process(
target=_run_worker_process,
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
max_tokens),
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model, max_tokens),
)
p.start()
workers.append(p)
@@ -175,8 +171,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
for k in workers:
if k.is_alive():
k.kill()
raise RuntimeError(
f"Worker {p.pid} failed with exit code {p.exitcode}")
raise RuntimeError(f"Worker {p.pid} failed with exit code {p.exitcode}")

actual_capture = counters["capture"].value
actual_replay = counters["replay"].value
@@ -185,18 +180,16 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
num_layers = counters["hidden_layers"].value

num_acl_graphs = num_layers + 1
num_comm_groups = sum(1 for s in [dp_size, 1]
if s > 1)  # dp_size=2, tp_size=1
num_comm_groups = sum(1 for s in [dp_size, 1] if s > 1)  # dp_size=2, tp_size=1

# Metric 1: Graph Capture (ACL Graph Construction)
# Ref: vllm_ascend.utils.update_aclgraph_sizes
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
num_acl_graphs / (1 + num_comm_groups * 2))
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))

expected_capture = max_batch_sizes * num_acl_graphs * dp_size
assert (
actual_capture == expected_capture
), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
assert actual_capture == expected_capture, (
f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
)

# Metric 2: Model Execution (NPUModelRunner.execute_model)
# vLLM Step Breakdown:
@@ -207,9 +200,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
# vllm default enables Async scheduler, this will take 1 more steps
expected_exec_model = (total_steps + 1 + 1) * dp_size

assert (
num_execute_model == expected_exec_model
), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
assert num_execute_model == expected_exec_model, (
f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
)

# Metric 3: Dummy Runs (Warmup & Alignment)
# vLLM synchronizes globally every 32 steps.
@@ -228,14 +221,12 @@ def test_models_aclgraph_capture_replay_metrics_dp2(

expected_dummy_run = (warmup_runs + padding_runs) * dp_size

assert (
num_dummy_run == expected_dummy_run
), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
assert num_dummy_run == expected_dummy_run, (
f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
)

# Metric 4: Graph Replay (Inference Execution)
# Replays happen for every aligned step across all graphs.
expected_replay = num_acl_graphs * aligned_steps * dp_size

assert (
actual_replay == expected_replay
), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
assert actual_replay == expected_replay, f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"

@@ -64,12 +64,8 @@ def test_qwen3_inference_dp2(model, max_tokens):
cmd.append("ascend")

print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600)
output = proc.stdout.decode(errors='ignore')
proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=600)
output = proc.stdout.decode(errors="ignore")

print(output)

@@ -27,6 +27,7 @@ MODELS = [
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]

@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@@ -36,36 +37,61 @@ async def test_models(model: str, tp_size: int) -> None:
vllm_server_args = [
[
"--port",
str(encode_port), "--model", model, "--gpu-memory-utilization",
"0.01", "--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
"--max-num-seqs", "1", "--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
str(encode_port),
"--model",
model,
"--gpu-memory-utilization",
"0.01",
"--tensor-parallel-size",
str(tp_size),
"--enforce-eager",
"--no-enable-prefix-caching",
"--max-model-len",
"10000",
"--max-num-batched-tokens",
"10000",
"--max-num-seqs",
"1",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"'
+ SHARED_STORAGE_PATH
+ '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}',
],
[
"--port",
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
str(pd_port),
"--model",
model,
"--gpu-memory-utilization",
"0.95",
"--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
str(tp_size),
"--enforce-eager",
"--max-model-len",
"10000",
"--max-num-batched-tokens",
"10000",
"--max-num-seqs",
"128",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
]
'{"ec_connector_extra_config":{"shared_storage_path":"'
+ SHARED_STORAGE_PATH
+ '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}',
],
]
proxy_port = get_open_port()
proxy_args = [
"--host", "127.0.0.1", "--port",
str(proxy_port), "--encode-servers-urls",
f"http://localhost:{encode_port}", "--decode-servers-urls",
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
"--host",
"127.0.0.1",
"--port",
str(proxy_port),
"--encode-servers-urls",
f"http://localhost:{encode_port}",
"--decode-servers-urls",
f"http://localhost:{pd_port}",
"--prefill-servers-urls",
"disable",
]

with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
send_image_request(model, proxy)

with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _, DisaggEpdProxy(proxy_args=proxy_args) as proxy:
send_image_request(model, proxy)
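For readability, the string-concatenated `--ec-transfer-config` values assembled above expand to plain JSON; an equivalent construction (a sketch — same structure, serialized with `json.dumps`):

```python
import json

SHARED_STORAGE_PATH = "/dev/shm/epd/storage"

# Producer-side value; the consumer server uses the same structure
# with "ec_role": "ec_consumer".
ec_transfer_config = json.dumps({
    "ec_connector_extra_config": {"shared_storage_path": SHARED_STORAGE_PATH},
    "ec_connector": "ECExampleConnector",
    "ec_role": "ec_producer",
})
```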
@@ -15,15 +15,12 @@ def test_deepseek_correctness_ep(model_name):
max_tokens = 5

# FIXME: Really strange that chunked prefill might lead to different results, investigate further
with VllmRunner(model_name,
cudagraph_capture_sizes=[1, 2, 4, 8],
tensor_parallel_size=2) as vllm_model:
with VllmRunner(model_name, cudagraph_capture_sizes=[1, 2, 4, 8], tensor_parallel_size=2) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

with VllmRunner(model_name,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
enable_expert_parallel=True) as vllm_model:
with VllmRunner(
model_name, tensor_parallel_size=2, cudagraph_capture_sizes=[1, 2, 4, 8], enable_expert_parallel=True
) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(

@@ -29,6 +29,7 @@ from unittest.mock import patch
import pytest
import torch_npu
from modelscope import snapshot_download  # type: ignore

from tests.e2e.conftest import wait_until_npu_memory_free

MODELS = ["Qwen/Qwen3-0.6B"]
@@ -39,9 +40,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
@@ -68,7 +67,7 @@ def test_qwen3_external_launcher(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")

print(output)

@@ -81,16 +80,24 @@ def test_qwen3_external_launcher(model):
@pytest.mark.parametrize("model", MOE_MODELS)
@wait_until_npu_memory_free()
def test_qwen3_moe_external_launcher_ep_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
"--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
"--enable-expert-parallel"
str(script),
"--model",
model,
"--tp-size",
"2",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-expert-parallel",
]

print(f"Running subprocess: {' '.join(cmd)}")
@@ -101,7 +108,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")

print(output)

@@ -113,9 +120,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
@wait_until_npu_memory_free()
def test_qwen3_external_launcher_with_sleepmode():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
@@ -147,7 +152,7 @@ def test_qwen3_external_launcher_with_sleepmode():
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")

print(output)

@@ -158,9 +163,7 @@ def test_qwen3_external_launcher_with_sleepmode():

@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode_level2():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
model_path = snapshot_download("Qwen/Qwen3-8B")
# TODO: Add moe model test
@@ -195,7 +198,7 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")

print(output)

@@ -210,14 +213,9 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
)
@pytest.mark.parametrize("model", MODELS)
@wait_until_npu_memory_free()
@patch.dict(os.environ, {
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
"HCCL_BUFFSIZE": "500"
})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1", "HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher_with_matmul_allreduce(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
@@ -236,7 +234,7 @@ def test_qwen3_external_launcher_with_matmul_allreduce(model):
timeout=600,
)

output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)

assert "Generated text:" in output

@@ -26,41 +26,39 @@ from tests.e2e.model_utils import check_outputs_equal

def test_qwen3_moe_full_decode_only_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)

with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
@@ -72,41 +70,39 @@ def test_qwen3_moe_full_decode_only_tp2():

@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
def test_qwen3_moe_full_graph_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={"cudagraph_mode": "FULL", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)

with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,

@@ -1,23 +1,22 @@
import pytest

from tests.e2e.conftest import VllmRunner
from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
MODEL_PATH, do_sample)
from tests.e2e.singlecard.test_ilama_lora import EXPECTED_LORA_OUTPUT, MODEL_PATH, do_sample

@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
with VllmRunner(
MODEL_PATH,
enable_lora=True,
max_loras=4,
dtype="half",
max_model_len=1024,
max_num_seqs=16,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
MODEL_PATH,
enable_lora=True,
max_loras=4,
dtype="half",
max_model_len=1024,
max_num_seqs=16,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
) as vllm_model:
output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)

@@ -20,8 +20,10 @@

Run `pytest tests/test_offline_inference.py`.
"""

import os
from unittest.mock import patch

import pytest
from vllm import SamplingParams

@@ -51,6 +53,7 @@ GPT_OSS_MODELS = [
"unsloth/gpt-oss-20b-BF16",
]

def test_deepseek_multistream_moe_tp2():
example_prompts = [
"Hello, my name is",
@@ -58,15 +61,15 @@ def test_deepseek_multistream_moe_tp2():
dtype = "half"
max_tokens = 5
with VllmRunner(
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
additional_config={
"enable_multistream_moe": True,
"refresh": True,
},
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
additional_config={
"enable_multistream_moe": True,
"refresh": True,
},
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -78,12 +81,12 @@ def test_qwen3_w4a8_dynamic_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)

@@ -92,20 +95,17 @@ def test_qwen3_moe_sp_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

with VllmRunner("Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
compilation_config={"pass_config": {
"enable_sp": True
}},
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
compilation_config={"pass_config": {"enable_sp": True}},
enable_expert_parallel=True,
enforce_eager=True,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)

@@ -113,33 +113,34 @@ def test_qwen3_moe_sp_tp2() -> None:
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
def test_deepseek_w4a8_accuracy_tp2(model):
prompts = [
"Hello, my name is", "The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
]
vllm_ds_w4a8_answers = [
'逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
"Hello, my name is",
"The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs",
]
vllm_ds_w4a8_answers = ["逍遙而至地去 accrued", "平行于我udo madreHelen", "ysteepaolis backwards Kj"]
sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
with VllmRunner(model,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
enable_expert_parallel=True) as vllm_model:
vllm_quant_outputs = vllm_model.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
enable_expert_parallel=True,
) as vllm_model:
vllm_quant_outputs = vllm_model.model.generate(prompts, sampling_params)

vllm_quant_outputs_list = []
for output in vllm_quant_outputs:
vllm_quant_outputs_list.append(
([output.outputs[0].index], output.outputs[0].text))
vllm_quant_outputs_list.append(([output.outputs[0].index], output.outputs[0].text))
vllm_answer_list = []
vllm_answer_list = ([([0], answer) for answer in vllm_ds_w4a8_answers])
vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]

check_outputs_equal(outputs_0_lst=vllm_answer_list,
outputs_1_lst=vllm_quant_outputs_list,
name_0="vllm_quant_outputs",
name_1="vllm_answer_outputs")
check_outputs_equal(
outputs_0_lst=vllm_answer_list,
outputs_1_lst=vllm_quant_outputs_list,
name_0="vllm_quant_outputs",
name_1="vllm_answer_outputs",
)

@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@@ -148,17 +149,16 @@ def test_qwen3_moe_fc2_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

with VllmRunner("Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)

@@ -168,20 +168,17 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=
True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
additional_config={"layer_sharding": ["o_proj"]}) as vllm_model:
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
additional_config={"layer_sharding": ["o_proj"]},
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)

@@ -190,17 +187,16 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
example_prompts = [
"test" * 1001,
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
quantization="ascend") as vllm_model:
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner(
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
quantization="ascend",
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)

@@ -213,12 +209,12 @@ def test_qwen3_dense_fc1_tp2(model):
max_tokens = 5

with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -232,13 +228,13 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
max_tokens = 5

with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
additional_config={"weight_prefetch_config": {"enabled": True}},
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
additional_config={"weight_prefetch_config": {"enabled": True}},
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -252,28 +248,20 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
"Hello ",
]
# "max_position_embeddings": 163840,
long_example_prompts = [
"Hello " * (163839 - 500) + "Hello"
]
long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
max_tokens = 500
with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
tensor_parallel_size=2,
quantization="ascend",
enable_expert_parallel=True,
max_model_len=163840,
compilation_config={
"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
speculative_config={
"num_speculative_tokens": 1,
"method": "deepseek_mtp"
},
additional_config={
"layer_sharding":["q_b_proj", "o_proj"]
},
reasoning_parser="deepseek_v3",
tokenizer_mode="deepseek_v32") as vllm_model:
with VllmRunner(
"vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
tensor_parallel_size=2,
quantization="ascend",
enable_expert_parallel=True,
max_model_len=163840,
compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
reasoning_parser="deepseek_v3",
tokenizer_mode="deepseek_v32",
) as vllm_model:
vllm_model.generate_greedy(short_example_prompts, max_tokens)
vllm_model.generate_greedy(long_example_prompts, max_tokens)

@@ -285,10 +273,10 @@ def test_qwen3_w4a4_distributed_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -300,8 +288,8 @@ def test_gpt_oss_distributed_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
tensor_parallel_size=2,
enforce_eager=True,
model,
tensor_parallel_size=2,
enforce_eager=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -32,9 +32,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_offline_load_and_sleepmode_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
@@ -65,7 +63,7 @@ def test_qwen3_offline_load_and_sleepmode_tp2(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")

print(output)

@@ -37,12 +37,13 @@ prompts = [
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
def test_models_pp2(model: str, tp_size: int, pp_size: int,
distributed_executor_backend: str) -> None:
with VllmRunner(model,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.7) as vllm_model:
def test_models_pp2(model: str, tp_size: int, pp_size: int, distributed_executor_backend: str) -> None:
with VllmRunner(
model,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_model.generate_greedy(prompts, 64)

@@ -11,11 +11,14 @@ MODELS = [
# for MHA
"Qwen/Qwen3-8B",
# for MLA
"deepseek-ai/DeepSeek-V2-Lite-Chat"
"deepseek-ai/DeepSeek-V2-Lite-Chat",
]

# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
# ruff: noqa: E501
LONG_PROMPT = (
"You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+ """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
@@ -49,32 +52,34 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
"""
)

INPUT_PROMPTS = [
LONG_PROMPT +
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT +
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
]

@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
with VllmRunner(model,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
with VllmRunner(
model,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

with VllmRunner(model,
enable_prefix_caching=False,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
with VllmRunner(
model,
enable_prefix_caching=False,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

check_outputs_equal(

@@ -16,7 +16,6 @@
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import pytest

from tests.e2e.conftest import VllmRunner

@@ -27,16 +26,16 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
]
max_tokens = 5
with VllmRunner(
"neuralmagic/Qwen2.5-3B-quantized.w8a8",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
"neuralmagic/Qwen2.5-3B-quantized.w8a8",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]

for i in range(len(vllm_output)):
@@ -50,36 +49,37 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]

for i in range(len(vllm_output)):
assert golden_results[i] == vllm_output[i][1]
print(f"Generated text: {vllm_output[i][1]!r}")

def test_qwen3_moe_w4a8_dynamic_llm_compressor():
example_prompts = [
"The president of the United States is",
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]

for i in range(len(vllm_output)):

@@ -34,11 +34,11 @@ def test_qwen3_moe_distributed_mp_tp2_ep():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -49,27 +49,27 @@ def test_qwen3_moe_w8a8_distributed_tp2():
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-W8A8",
max_model_len=8192,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
"vllm-ascend/Qwen3-30B-A3B-W8A8",
max_model_len=8192,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

def test_qwen3_moe_distributed_aiv_tp2():
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
os.environ["HCCL_OP_EXPANSION_MODE"] = "AIV"
example_prompts = [
"Hello, my name is",
]
dtype = "auto"
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
"Qwen/Qwen3-30B-A3B",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -80,23 +80,24 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
port = get_open_port()
compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
server_args = [
"--max_model_len", "8192", "--tensor_parallel_size", "2",
"--enable_expert_parallel", "--quantization", "ascend", "--port",
str(port), "--compilation-config", compilation_config
"--max_model_len",
"8192",
"--tensor_parallel_size",
"2",
"--enable_expert_parallel",
"--quantization",
"ascend",
"--port",
str(port),
"--compilation-config",
compilation_config,
]
env_dict = {"HCCL_BUFFSIZE": "1024"}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=400,
temperature=0,
top_p=1.0,
n=1)
batch = await client.completions.create(
model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
)
gt_choices: list[openai.types.CompletionChoice] = batch.choices

# dynamic eplb test
@@ -108,22 +109,14 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
"dynamic_eplb": True,
"expert_heat_collection_interval": 100,
"algorithm_execution_interval": 20,
"num_redundant_experts": 2
"num_redundant_experts": 2,
}
}
server_args.extend(["--additional-config", json.dumps(additional_config)])
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=400,
temperature=0,
top_p=1.0,
n=1)
batch = await client.completions.create(
model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
)
eplb_choices: list[openai.types.CompletionChoice] = batch.choices
assert gt_choices[0].text == eplb_choices[
0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
assert gt_choices[0].text == eplb_choices[0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"

@@ -1,10 +1,11 @@
import os
from unittest.mock import patch

from tests.e2e.conftest import VllmRunner
from vllm import SamplingParams
from vllm.sampling_params import RequestOutputKind

from tests.e2e.conftest import VllmRunner

@patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})
def test_qwen3_moe_routing_replay():
@@ -12,18 +13,15 @@ def test_qwen3_moe_routing_replay():
"Hello, please introduce yourself.",
]
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
enable_return_routed_experts=True,
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
enable_return_routed_experts=True,
) as vllm_model:
sampling_params = SamplingParams(
max_tokens=5,
temperature=0.8,
top_p=0.95,
output_kind=RequestOutputKind.FINAL_ONLY
max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY
)
inputs = vllm_model.get_inputs(prompts=prompts)
outputs = vllm_model.model.generate(prompts=inputs, sampling_params=sampling_params)

@@ -84,11 +84,7 @@ async def test_models(model: str) -> None:
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, env_dict=env_dict, auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,

@@ -13,69 +13,65 @@ MODELS = [

@pytest.mark.parametrize("model", MODELS)
def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:

if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]

prompts = [
"Hello, my name is", "The capital of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The capital of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)

with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={
"enable_shared_expert_dp": True,
},
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_eager_outputs = runner.model.generate(
prompts, sampling_params)
shared_expert_dp_eager_outputs = runner.model.generate(prompts, sampling_params)

with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
enable_expert_parallel=True,
compilation_config={
"cudagraph_capture_sizes": [1, 4, 8, 16],
"cudagraph_mode": "FULL_DECODE_ONLY",
},
additional_config={
"enable_shared_expert_dp": True,
},
model,
max_model_len=1024,
tensor_parallel_size=2,
enable_expert_parallel=True,
compilation_config={
"cudagraph_capture_sizes": [1, 4, 8, 16],
"cudagraph_mode": "FULL_DECODE_ONLY",
},
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
shared_expert_dp_aclgraph_outputs = runner.model.generate(prompts, sampling_params)

vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

shared_expert_dp_eager_outputs_list = []
for output in shared_expert_dp_eager_outputs:
shared_expert_dp_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

shared_expert_dp_aclgraph_outputs_list = []
for output in shared_expert_dp_aclgraph_outputs:
shared_expert_dp_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,

@@ -39,8 +39,7 @@ api_keyword_args = {
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models_single_request_aclgraph_dp2(model: str,
dp_size: int) -> None:
async def test_models_single_request_aclgraph_dp2(model: str, dp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
@@ -48,36 +47,51 @@ async def test_models_single_request_aclgraph_dp2(model: str,
}
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--no-enable-prefix-caching",
"--tensor-parallel-size",
"1",
"--data-parallel-size",
str(dp_size), "--quantization", "ascend", "--max-model-len",
"1024", "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
str(dp_size),
"--quantization",
"ascend",
"--max-model-len",
"1024",
"--port",
str(port),
"--trust-remote-code",
"--gpu-memory-utilization",
"0.9",
]
else:
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--no-enable-prefix-caching",
"--tensor-parallel-size",
"1",
"--data-parallel-size",
str(dp_size), "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
str(dp_size),
"--port",
str(port),
"--trust-remote-code",
"--gpu-memory-utilization",
"0.9",
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
vllm_serve_args=server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
with RemoteOpenAIServer(
model, vllm_serve_args=server_args, server_port=port, env_dict=env_dict, auto_port=False
) as server:
client = server.get_async_client()

try:
batch = await asyncio.wait_for(client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
),
timeout=10.0)
batch = await asyncio.wait_for(
client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
),
timeout=10.0,
)
except asyncio.TimeoutError:
pytest.fail("Model did not return response within 10 seconds")

@@ -1,5 +1,3 @@
import os

import pytest
from vllm import SamplingParams

@@ -14,47 +12,46 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
def test_qwen3_vl_sp_tp2(model: str) -> None:
prompts = [
"Hello, my name is", "The capital of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The capital of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=10, temperature=0.0)

with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": False}
},
additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}}
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": False},
},
additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}},
) as runner:
no_sp_outputs = runner.model.generate(prompts, sampling_params)

with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": True}
},
additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}}
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": True},
},
additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}},
) as runner:
sp_outputs = runner.model.generate(
prompts, sampling_params)
sp_outputs = runner.model.generate(prompts, sampling_params)

no_sp_outputs_list = []
for output in no_sp_outputs:
no_sp_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
no_sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

sp_outputs_list = []
for output in sp_outputs:
sp_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))

check_outputs_equal(
outputs_0_lst=no_sp_outputs_list,