### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
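
The changes are mechanical restyling of the e2e tests: import reordering, reflow of statements that were previously wrapped across multiple lines, and small logic-preserving simplifications (for example, `enable_expert_parallel="DeepSeek" in model_path` in place of a redundant ternary). Representative hunks follow; the per-file diff headers were lost in rendering, but the hunk contexts match `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py`.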
```diff
@@ -25,8 +25,8 @@ import pytest
 import torch
 from vllm.utils.network_utils import get_open_port
 
-from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
 from tests.e2e.conftest import wait_until_npu_memory_free
+from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
 
 MODELS = [
     # Offline data parallel mode will be not supported/useful for dense models
@@ -58,8 +58,7 @@ def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
     ]
 
     for cls, method, counter in hooks:
-        stack.enter_context(
-            patch.object(cls, method, make_spy(cls, method, counter)))
+        stack.enter_context(patch.object(cls, method, make_spy(cls, method, counter)))
 
     return stack
```
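
The `make_spy` helper patched in above is defined elsewhere in the test file and is not part of this diff. A minimal sketch of the pattern it presumably implements (wrap the original method, bump a shared counter, delegate), using illustrative names only:

```python
import contextlib
from unittest.mock import patch


def make_spy(cls, method_name, counter):
    """Sketch: wrap cls.<method_name> so every call increments a shared counter."""
    original = getattr(cls, method_name)

    def spy(self, *args, **kwargs):
        counter.value += 1  # e.g. a multiprocessing.Value shared with the parent
        return original(self, *args, **kwargs)

    return spy


def install_spies(hooks):
    """hooks: (cls, method_name, counter) tuples, mirroring the loop in the diff."""
    stack = contextlib.ExitStack()
    for cls, method, counter in hooks:
        stack.enter_context(patch.object(cls, method, make_spy(cls, method, counter)))
    return stack
```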
```diff
@@ -75,18 +74,19 @@ def _run_worker_process(
     max_tokens: int,
 ):
     """Main entry point for the worker process."""
-    os.environ.update({
-        "VLLM_DP_RANK": str(rank),
-        "VLLM_DP_RANK_LOCAL": str(local_rank),
-        "VLLM_DP_SIZE": str(world_size),
-        "VLLM_DP_MASTER_IP": master_ip,
-        "VLLM_DP_MASTER_PORT": str(master_port),
-    })
+    os.environ.update(
+        {
+            "VLLM_DP_RANK": str(rank),
+            "VLLM_DP_RANK_LOCAL": str(local_rank),
+            "VLLM_DP_SIZE": str(world_size),
+            "VLLM_DP_MASTER_IP": master_ip,
+            "VLLM_DP_MASTER_PORT": str(master_port),
+        }
+    )
 
     # Import vLLM only after environment setup
     from vllm import LLM, SamplingParams
-    from vllm.distributed.parallel_state import (
-        destroy_distributed_environment, destroy_model_parallel)
+    from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
 
     # Apply hooks and run inference
     with _install_spies(counters):
@@ -100,23 +100,20 @@ def _run_worker_process(
         # Simple data sharding
         chunk_size = len(prompts) // world_size
         start_idx = rank * chunk_size
-        end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
-            prompts)
+        end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
         local_prompts = prompts[start_idx:end_idx]
 
         llm = LLM(
             model=model_path,
             quantization="ascend" if "W8A8" in model_path else None,
-            enable_expert_parallel=True if "DeepSeek" in model_path else False,
+            enable_expert_parallel="DeepSeek" in model_path,
             trust_remote_code=True,
         )
 
         # Expose model config to the main test process
-        counters["hidden_layers"].value = (
-            llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
+        counters["hidden_layers"].value = llm.llm_engine.model_config.hf_text_config.num_hidden_layers
 
-        llm.generate(local_prompts,
-                     SamplingParams(max_tokens=max_tokens, temperature=0.0))
+        llm.generate(local_prompts, SamplingParams(max_tokens=max_tokens, temperature=0.0))
 
         # Explicit cleanup is mandatory in multi-process vLLM tests
         del llm
```
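
The sharding arithmetic above gives every rank an equal chunk and lets the last rank absorb any remainder. A standalone check of that logic:

```python
# 7 prompts across 2 ranks, mirroring the chunking in the diff above.
prompts = [f"p{i}" for i in range(7)]
world_size = 2
chunk_size = len(prompts) // world_size  # 3
for rank in range(world_size):
    start_idx = rank * chunk_size
    end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
    print(rank, prompts[start_idx:end_idx])
# rank 0 -> ['p0', 'p1', 'p2']
# rank 1 -> ['p3', 'p4', 'p5', 'p6']  (the last rank takes the remainder)
```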
```diff
@@ -162,8 +159,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     for rank in range(dp_size):
         p = multiprocessing.Process(
             target=_run_worker_process,
-            args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
-                  max_tokens),
+            args=(rank, rank, dp_size, "127.0.0.1", port, counters, model, max_tokens),
         )
         p.start()
         workers.append(p)
@@ -175,8 +171,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
             for k in workers:
                 if k.is_alive():
                     k.kill()
-            raise RuntimeError(
-                f"Worker {p.pid} failed with exit code {p.exitcode}")
+            raise RuntimeError(f"Worker {p.pid} failed with exit code {p.exitcode}")
 
     actual_capture = counters["capture"].value
     actual_replay = counters["replay"].value
@@ -185,18 +180,16 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     num_layers = counters["hidden_layers"].value
 
     num_acl_graphs = num_layers + 1
-    num_comm_groups = sum(1 for s in [dp_size, 1]
-                          if s > 1)  # dp_size=2, tp_size=1
+    num_comm_groups = sum(1 for s in [dp_size, 1] if s > 1)  # dp_size=2, tp_size=1
 
     # Metric 1: Graph Capture (ACL Graph Construction)
    # Ref: vllm_ascend.utils.update_aclgraph_sizes
-    max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
-                                 num_acl_graphs / (1 + num_comm_groups * 2))
+    max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))
 
     expected_capture = max_batch_sizes * num_acl_graphs * dp_size
-    assert (
-        actual_capture == expected_capture
-    ), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
+    assert actual_capture == expected_capture, (
+        f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
+    )
 
     # Metric 2: Model Execution (NPUModelRunner.execute_model)
     # vLLM Step Breakdown:
```
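
To make the capture formula concrete: with this test's dp_size=2 and tp_size=1, only the DP group counts as a communication group. The layer count below is an assumed example value; the test reads the real one from the model config via `counters["hidden_layers"]`.

```python
import math

dp_size, tp_size = 2, 1
num_layers = 28                  # assumed for illustration only
num_acl_graphs = num_layers + 1  # 29
num_comm_groups = sum(1 for s in [dp_size, tp_size] if s > 1)  # 1

# Same expression as in the diff: floor((1800 - 1 * 40) / 29 / 3) = 20
max_batch_sizes = math.floor(
    (1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2)
)
expected_capture = max_batch_sizes * num_acl_graphs * dp_size  # 20 * 29 * 2 = 1160
```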
```diff
@@ -207,9 +200,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     # vllm default enables Async scheduler, this will take 1 more steps
     expected_exec_model = (total_steps + 1 + 1) * dp_size
 
-    assert (
-        num_execute_model == expected_exec_model
-    ), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
+    assert num_execute_model == expected_exec_model, (
+        f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
+    )
 
     # Metric 3: Dummy Runs (Warmup & Alignment)
     # vLLM synchronizes globally every 32 steps.
@@ -228,14 +221,12 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
 
     expected_dummy_run = (warmup_runs + padding_runs) * dp_size
 
-    assert (
-        num_dummy_run == expected_dummy_run
-    ), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
+    assert num_dummy_run == expected_dummy_run, (
+        f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
+    )
 
     # Metric 4: Graph Replay (Inference Execution)
     # Replays happen for every aligned step across all graphs.
     expected_replay = num_acl_graphs * aligned_steps * dp_size
 
-    assert (
-        actual_replay == expected_replay
-    ), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
+    assert actual_replay == expected_replay, f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
```