[CI/UT] Add test for chunk prefill and prefix cache on v1/AscendScheduler (#1505)

### What this PR does / why we need it?
Add test for chunked prefill and prefix cache on v1/AscendScheduler

Covered scenarios:
- `Qwen/Qwen3-0.6B-Base` and `deepseek-ai/DeepSeek-V2-Lite-Chat` ---
multicard CI time increased by 19 min
- `V1 + default scheduler` vs `V1 + default scheduler + enable prefix
cache`
- `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable prefix
cache` vs `V1 + Ascend scheduler + enable prefix cache + enable chunked
prefill`
- `Qwen/Qwen3-0.6B-Base` --- singlecard CI time increased by 8 min
- `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable chunked
prefill`

should rebase after #1498 and #1446
### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added test.

Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2025-07-02 16:57:03 +08:00
committed by GitHub
parent 6b80c5acba
commit 59237ea788
6 changed files with 241 additions and 0 deletions

View File

@@ -19,6 +19,7 @@
import contextlib
import gc
import os
from typing import Any, List, Optional, Tuple, TypeVar, Union
import numpy as np
@@ -59,6 +60,9 @@ PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "e2e", "prompts", "example.txt")]
def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
destroy_model_parallel()
@@ -367,6 +371,20 @@ def prompt_template(request):
return PROMPT_TEMPLATES[request.param]
def _read_prompts(filename: str) -> list[str]:
with open(filename) as f:
prompts = f.readlines()
return prompts
@pytest.fixture
def example_prompts() -> list[str]:
prompts = []
for filename in _TEST_PROMPTS:
prompts += _read_prompts(filename)
return prompts
@pytest.fixture(scope="session")
def ilama_lora_files():
return snapshot_download(repo_id="jeeejeee/ilama-text2sql-spider")

View File

@@ -0,0 +1,152 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
import os
import pytest
from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal
MODELS = [
# for MHA
"Qwen/Qwen3-8B-Base",
# for MLA
"deepseek-ai/DeepSeek-V2-Lite-Chat"
]
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON |
| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK |
| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW |
| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ |
| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE |
| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY |
| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC |
| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK |
| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC|
| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ |
| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE |
| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA |
| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB |
| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK |
| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD |
| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ |
| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE |
| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA |
| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON |
| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK |
| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA |
| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ|
| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE |
| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO |
| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC |
| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK |
| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA |
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
"""
INPUT_PROMPTS = [
LONG_PROMPT +
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT +
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
]
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
reason="mtp is not supported on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
with VllmRunner(model,
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
enable_prefix_caching=False,
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=prefix_cache_output,
name_0="vllm_output",
name_1="prefix_cache_output",
)
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
reason="mtp is not supported on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_ascend_scheduler(model: str,
max_tokens: int) -> None:
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_prefix_caching': True,
},
},
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_prefix_caching': True,
"enable_chunked_prefill": True,
},
},
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=prefix_cache_output,
name_0="vllm_output",
name_1="prefix_cache_output",
)
check_outputs_equal(
outputs_0_lst=chunk_prefill_prefix_cache_output,
outputs_1_lst=prefix_cache_output,
name_0="chunk_prefill_prefix_cache_output",
name_1="prefix_cache_output",
)

View File

@@ -0,0 +1,8 @@
vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
Compare and contrast artificial intelligence with human intelligence in terms of processing information.
Describe the basic components of a neural network and how it can be trained.
Write a short story about a robot that dreams for the first time.
Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'

View File

@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the with and without chunked prefill on AscendScheduler
It tests chunked prefill. Chunked prefill can be enabled by
`additional_config={'ascend_scheduler_config': {'enabled': True, 'enable_chunked_prefill': True,},}`.
If prefill size exceeds max_num_batched_tokens, prefill requests are chunked.
Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`.
"""
import os
import pytest
from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal
MODELS = [
"Qwen/Qwen3-0.6B-Base",
]
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", reason="only test on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_ascend_scheduler(
example_prompts, model: str, max_tokens: int,
chunked_prefill_token_size: int) -> None:
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_chunked_prefill': True,
},
},
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
enforce_eager=True,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
chunked_prefill_output = vllm_model.generate_greedy(
example_prompts, max_tokens)
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
enforce_eager=True,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=chunked_prefill_output,
name_0="vllm_output",
name_1="chunked_prefill_output",
)