Enable ACL graph: revise test cases for all features (#3388)

### What this PR does / why we need it?
This PR revises the test cases for various features across the repository to enable ACL graph mode: `enforce_eager` is switched from `True` to `False`, the `enforce_eager` parametrization is dropped, and redundant eager-only test variants are removed, so the tests exercise graph capture by default.
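
The recurring change is to stop forcing eager execution so that vllm-ascend captures and replays the model as an ACL graph (Ascend's analogue of CUDA graphs). Below is a minimal sketch of the pattern using the `VllmRunner` test helper seen in the diffs; the import path and model name are assumptions for illustration:

```python
from tests.e2e.conftest import VllmRunner  # helper used by these tests; exact path assumed

example_prompts = ["Hello, my name is"]
max_tokens = 5

# Before this PR: enforce_eager=True ran the model op-by-op, skipping graph capture.
# After: enforce_eager=False (vLLM's default) exercises the ACL graph path.
with VllmRunner("Qwen/Qwen3-0.6B",  # illustrative model name
                tensor_parallel_size=2,
                enforce_eager=False) as vllm_model:
    vllm_model.generate_greedy(example_prompts, max_tokens)
```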

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Unit tests.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
lilinsiman
2025-10-17 17:15:19 +08:00
committed by GitHub
parent bf87606932
commit 1b424fb7f1
17 changed files with 34 additions and 117 deletions

@@ -52,8 +52,8 @@ def test_data_parallel_inference(model, max_tokens):
         "--node-rank",
         "0",
         "--trust-remote-code",
-        "--enforce-eager",
     ]
     if model == "Qwen/Qwen3-30B-A3B":
         cmd.append("--enable-expert-parallel")

@@ -21,7 +21,7 @@ def test_e2e_ep_correctness(model_name):
             additional_config={"ascend_scheduler_config": {
                 "enabled": True
             }},
-            enforce_eager=True) as vllm_model:
+            enforce_eager=False) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
     with VllmRunner(
@@ -31,7 +31,7 @@ def test_e2e_ep_correctness(model_name):
             additional_config={"ascend_scheduler_config": {
                 "enabled": True
             }},
-            enforce_eager=True) as vllm_model:
+            enforce_eager=False) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
     check_outputs_equal(

@@ -16,7 +16,7 @@ def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
                     max_num_seqs=16,
                     tensor_parallel_size=2,
                     distributed_executor_backend=distributed_executor_backend,
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)
     for i in range(len(EXPECTED_LORA_OUTPUT)):

@@ -52,7 +52,7 @@ def test_models_distributed_QwQ():
             dtype=dtype,
             tensor_parallel_size=2,
             distributed_executor_backend="mp",
-            enforce_eager=True,
+            enforce_eager=False,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -163,11 +163,10 @@ def test_sp_for_qwen3_moe() -> None:
         vllm_model.generate(example_prompts, sampling_params)
-@pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
-def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
+def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model):
     example_prompts = [
         "Hello, my name is",
     ]
@@ -176,7 +175,7 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
     with VllmRunner(
             snapshot_download(model),
             max_model_len=8192,
-            enforce_eager=enforce_eager,
+            enforce_eager=False,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",
@@ -184,12 +183,10 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
         vllm_model.generate_greedy(example_prompts, max_tokens)
-@pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
-def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
-        model, enforce_eager):
+def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(model):
     example_prompts = [
         "Hello, my name is",
     ]
@@ -198,7 +195,7 @@ def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
     with VllmRunner(
             snapshot_download(model),
             max_model_len=8192,
-            enforce_eager=enforce_eager,
+            enforce_eager=False,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",

@@ -62,7 +62,7 @@ INPUT_PROMPTS = [
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
     with VllmRunner(model,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     max_model_len=2048,
                     tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
@@ -71,7 +71,7 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
     with VllmRunner(model,
                     enable_prefix_caching=False,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     max_model_len=2048,
                     tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
@@ -96,7 +96,7 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
                         'enabled': True,
                     },
                 },
-                enforce_eager=True,
+                enforce_eager=False,
                 max_model_len=2048,
                 tensor_parallel_size=2,
                 gpu_memory_utilization=0.7) as vllm_model:
@@ -109,7 +109,7 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
                         'enable_prefix_caching': True,
                     },
                 },
-                enforce_eager=True,
+                enforce_eager=False,
                 max_model_len=2048,
                 tensor_parallel_size=2,
                 gpu_memory_utilization=0.7) as vllm_model:

@@ -33,47 +33,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
-@pytest.mark.parametrize("model", MOE_MODELS)
-def test_external_launcher_eager(model):
-    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-    env = os.environ.copy()
-    # TODO: Change to 2 when ci machine has 4 cards
-    cmd = [
-        sys.executable,
-        str(script),
-        "--model",
-        model,
-        "--tp-size",
-        "2",
-        "--proc-per-node",
-        "2",
-        "--trust-remote-code",
-        "--enforce-eager",
-        "--enable-expert-parallel",
-        "--enable-sleep-mode",
-        "--model-weight-gib",
-        "20",
-    ]
-    print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(
-        cmd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=600,
-    )
-    output = proc.stdout.decode()
-    print(output)
-    assert "TP RANKS: [0]" in output
-    assert "TP RANKS: [1]" in output
-    assert "Generated text:" in output
-    assert proc.returncode == 0
-
 @pytest.mark.parametrize("model", MOE_MODELS)
-def test_external_launcher_aclgraph(model):
+def test_external_launcher(model):
     script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
     env = os.environ.copy()
     # TODO: Change to 2 when ci machine has 4 cards
@@ -147,42 +107,3 @@ def test_external_launcher_dense(model):
     assert "TP RANKS: [1]" in output
     assert "Generated text:" in output
     assert proc.returncode == 0
-
-@pytest.mark.parametrize("model", MODELS)
-def test_external_launcher_dense_eager(model):
-    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-    env = os.environ.copy()
-    # TODO: Change to 2 when ci machine has 4 cards
-    cmd = [
-        sys.executable,
-        str(script),
-        "--model",
-        model,
-        "--tp-size",
-        "2",
-        "--proc-per-node",
-        "2",
-        "--trust-remote-code",
-        "--enforce-eager",
-        "--enable-sleep-mode",
-        "--model-weight-gib",
-        "20",
-    ]
-    print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(
-        cmd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=600,
-    )
-    output = proc.stdout.decode()
-    print(output)
-    assert "TP RANKS: [0]" in output
-    assert "TP RANKS: [1]" in output
-    assert "Generated text:" in output
-    assert proc.returncode == 0