[CI/UT] Add test for chunk prefill and prefix cache on v1/AscendScheduler (#1505)

### What this PR does / why we need it?
Add test for chunked prefill and prefix cache on v1/AscendScheduler

Covered scenarios (a configuration sketch follows the list):
- `Qwen/Qwen3-0.6B-Base` and `deepseek-ai/DeepSeek-V2-Lite-Chat` --- multicard CI time increased by 19 min
  - `V1 + default scheduler` vs `V1 + default scheduler + enable prefix cache`
  - `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable prefix cache` vs `V1 + Ascend scheduler + enable prefix cache + enable chunked prefill`
- `Qwen/Qwen3-0.6B-Base` --- singlecard CI time increased by 8 min
  - `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable chunked prefill`

This PR should be rebased after #1498 and #1446.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with the newly added tests.

Signed-off-by: MengqingCao <cmq0113@163.com>
Mengqing Cao
2025-07-02 16:57:03 +08:00
committed by GitHub
parent 6b80c5acba
commit 59237ea788
6 changed files with 241 additions and 0 deletions


@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the with and without chunked prefill on AscendScheduler
It tests chunked prefill. Chunked prefill can be enabled by
`additional_config={'ascend_scheduler_config': {'enabled': True, 'enable_chunked_prefill': True,},}`.
If prefill size exceeds max_num_batched_tokens, prefill requests are chunked.
Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`.
"""
import os

import pytest

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal

MODELS = [
    "Qwen/Qwen3-0.6B-Base",
]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", reason="only test on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_ascend_scheduler(
        example_prompts, model: str, max_tokens: int,
        chunked_prefill_token_size: int) -> None:
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size
    # Run with chunked prefill enabled on the Ascend scheduler; the small
    # max_num_batched_tokens forces prefill requests to be chunked.
    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_chunked_prefill': True,
                        },
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        chunked_prefill_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    # Run with the Ascend scheduler only, without chunked prefill.
    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    # Greedy outputs should match with and without chunked prefill.
    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )