### What this PR does / why we need it?
After https://github.com/vllm-project/vllm-ascend/pull/4113, there is no
synchronization between steps. However, in async scheduling with
aclgraph, it is possible that the CPU's record event for the current
iteration completes before the previous iteration's graph execution has
finished.
If the CPU is fast enough, the device will hang on event_wait in iteration
i+1 (assuming that event_record is executed immediately on the update
stream of the device):
<img width="1812" height="489" alt="image"
src="https://github.com/user-attachments/assets/373fe655-afe5-4d7d-807e-b0aacf24a543"
/>
After adding synchronization, record is launched after the graph replay:
<img width="1803" height="466" alt="image"
src="https://github.com/user-attachments/assets/a8a68053-bd7d-49f5-a79c-9a26ef1285cc"
/>
bubble time caused by synchronization is about 85 us on G8600:
<img width="1491" height="804" alt="image"
src="https://github.com/user-attachments/assets/968611ee-f39a-4329-8150-1c4adba25dd1"
/>
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
---------
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
Co-authored-by: hwhaokun <haokun0405@163.com>
173 lines
6.5 KiB
Python
173 lines
6.5 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
import pytest
|
|
from vllm import SamplingParams
|
|
|
|
from tests.e2e.conftest import VllmRunner
|
|
from tests.e2e.model_utils import check_outputs_equal
|
|
|
|
MODEL = "Qwen/Qwen3-0.6B"
|
|
|
|
|
|
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_concurrent_partial_prefill(enforce_eager):
    """Three prompts scheduled together each produce exactly one completion.

    `max_num_seqs=3` lets all three requests run concurrently; the small
    token budget exercises partial prefill under the ascend scheduler.
    """
    runner_kwargs = dict(
        additional_config={
            'ascend_scheduler_config': {
                'enabled': True,
            },
        },
        max_num_seqs=3,
        max_num_batched_tokens=2048,
        enforce_eager=enforce_eager,
        max_model_len=2048,
        gpu_memory_utilization=0.7,
    )
    with VllmRunner(MODEL, **runner_kwargs) as runner:
        prompts = ["Hello my name is Robert and I"] * 3
        results = runner.model.generate(prompts)
        # One RequestOutput per prompt, each with a single completion.
        assert len(results) == 3
        assert all(len(r.outputs) == 1 for r in results)
|
|
|
|
|
|
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_prefix_cache_stats_is_recorded(enforce_eager):
    """Verify prefix-cache hits are reported via ``num_cached_tokens``."""
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=2048,
                    enforce_eager=enforce_eager,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        # 129 tokens make sure the first 128 tokens fill complete cache
        # blocks (presumably 16-token blocks, 8 full blocks — confirm
        # against the KV-cache block size), so a second identical request
        # can reuse all 128 of them.
        input_tokens = {"prompt_token_ids": [101] * 129}
        # First request warms the prefix cache.
        _ = vllm_model.model.generate([input_tokens])
        # Identical second request should hit the cache for 128 tokens.
        outputs = vllm_model.model.generate([input_tokens])
        assert outputs[0].num_cached_tokens == 128
|
|
|
|
|
|
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_ascend_scheduler(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    """Greedy outputs with chunked prefill must match the un-chunked run."""
    prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]

    # Chunked run: a tiny batched-token budget forces the prompt to be
    # prefilled in chunks.
    with VllmRunner(
            MODEL,
            additional_config={
                'ascend_scheduler_config': {
                    'enabled': True,
                    'enable_chunked_prefill': True,
                },
            },
            max_num_seqs=chunked_prefill_token_size,
            max_num_batched_tokens=chunked_prefill_token_size,
            max_model_len=2048,
            gpu_memory_utilization=0.7) as vllm_model:
        chunked_prefill_output = vllm_model.generate_greedy(
            prompts, max_tokens)

    # Baseline run: same scheduler, chunked prefill disabled.
    with VllmRunner(
            MODEL,
            additional_config={
                'ascend_scheduler_config': {
                    'enabled': True,
                },
            },
            max_model_len=2048,
            gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )
|
|
|
|
|
|
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_scheduler_dynamic_batch(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    """Dynamic-batch run (SLO limit 0) matches the default run (limit -1).

    Both runs generate greedily so the outputs are deterministic and
    directly comparable token-by-token.
    """
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

    # Dynamic-batch run: SLO limit 0 enables the dynamic-batch path.
    with VllmRunner(MODEL,
                    additional_config={
                        'SLO_limits_for_dynamic_batch': 0,
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        dynamic_batch_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    # Baseline run: -1 disables the dynamic-batch path (presumably the
    # default — confirm against the ascend config docs).
    with VllmRunner(MODEL,
                    additional_config={
                        'SLO_limits_for_dynamic_batch': -1,
                    },
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=dynamic_batch_output,
        name_0="vllm_output",
        # Fixed copy-paste mislabel: was "chunked_prefill_output", which
        # produced misleading failure messages for this test.
        name_1="dynamic_batch_output",
    )
|
|
|
|
|
|
def test_async_scheduling_eager() -> None:
    """Smoke test: async scheduling in eager mode completes generation."""
    base_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Repeat the prompts so multiple batches are scheduled asynchronously.
    prompts = base_prompts * 10
    params = SamplingParams(
        temperature=0.2,
        max_tokens=10,
        stop_token_ids=None,
    )

    runner = VllmRunner(
        "Qwen/Qwen2.5-0.5B-Instruct",
        max_model_len=4096,
        max_num_seqs=50,
        dtype="bfloat16",
        gpu_memory_utilization=0.9,
        async_scheduling=True,
    )
    # Passing means no hang/crash; no output assertions are made here.
    with runner as vllm_model:
        vllm_model.generate(prompts, sampling_params=params)
|
|
|
|
|
|
def test_async_scheduling_with_full_graph() -> None:
    """Smoke test: async scheduling combined with FULL cudagraph mode.

    This is the scenario the step-synchronization fix targets: graph
    replay on device overlapping with async CPU-side scheduling.
    """
    base_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Repeat the prompts so multiple batches are scheduled asynchronously.
    prompts = base_prompts * 10
    params = SamplingParams(
        temperature=0.2,
        max_tokens=10,
        stop_token_ids=None,
    )

    runner = VllmRunner(
        "Qwen/Qwen3-8B",
        max_model_len=4096,
        max_num_seqs=50,
        dtype="bfloat16",
        gpu_memory_utilization=0.9,
        async_scheduling=True,
        compilation_config={"cudagraph_mode": "FULL"},
    )
    # Passing means no hang/crash; no output assertions are made here.
    with runner as vllm_model:
        vllm_model.generate(prompts, sampling_params=params)
|