[CI/UT] Add test for chunk prefill and prefix cache on v1/AscendScheduler (#1505)
### What this PR does / why we need it?

Add tests for chunked prefill and prefix cache on v1/AscendScheduler.

Covered scenarios:
- `Qwen/Qwen3-0.6B-Base` and `deepseek-ai/DeepSeek-V2-Lite-Chat` --- multicard CI time increased by 19 min
  - `V1 + default scheduler` vs `V1 + default scheduler + enable prefix cache`
  - `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable prefix cache` vs `V1 + Ascend scheduler + enable prefix cache + enable chunked prefill`
- `Qwen/Qwen3-0.6B-Base` --- singlecard CI time increased by 8 min
  - `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable chunked prefill`

Should be rebased after #1498 and #1446.

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

CI passed with the newly added tests.

Signed-off-by: MengqingCao <cmq0113@163.com>
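For context, here is a minimal sketch of how the prefix-cache comparison scenario listed above could be exercised with the same helpers used in the diff below. It is not part of the shown diff: the test name, the `enable_prefix_caching` keyword (assumed to be forwarded by `VllmRunner` to the engine arguments), and the exact parametrization are assumptions; the actual multicard/singlecard tests in this PR may differ.

```python
import pytest

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal


@pytest.mark.parametrize("model", ["Qwen/Qwen3-0.6B-Base"])
@pytest.mark.parametrize("max_tokens", [4])
def test_prefix_cache_with_ascend_scheduler(example_prompts, model: str,
                                            max_tokens: int) -> None:
    # Baseline: Ascend scheduler only.
    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        baseline_output = vllm_model.generate_greedy(example_prompts,
                                                     max_tokens)

    # Same scheduler with prefix caching enabled; greedy outputs should match.
    # `enable_prefix_caching=True` is assumed to reach the engine args.
    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    enable_prefix_caching=True,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        prefix_cache_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=baseline_output,
        outputs_1_lst=prefix_cache_output,
        name_0="baseline_output",
        name_1="prefix_cache_output",
    )
```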
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs with and without chunked prefill on AscendScheduler.

It tests chunked prefill. Chunked prefill can be enabled by
`additional_config={'ascend_scheduler_config': {'enabled': True, 'enable_chunked_prefill': True,},}`.
If prefill size exceeds max_num_batched_tokens, prefill requests are chunked.

Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`.
"""
import os

import pytest

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal

MODELS = [
    "Qwen/Qwen3-0.6B-Base",
]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", reason="only test on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_ascend_scheduler(
        example_prompts, model: str, max_tokens: int,
        chunked_prefill_token_size: int) -> None:
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size
    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_chunked_prefill': True,
                        },
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        chunked_prefill_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )