From 981a14f8d536a073cde064c6f00c1b49f2c92b4b Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Tue, 2 Dec 2025 08:54:34 +0800
Subject: [PATCH] [CI] enable chunked prefill by default (#4569)

Set `enable_chunked_prefill` to True for the e2e tests by default to keep
the same behavior as vLLM.

- vLLM version: v0.11.2

Signed-off-by: wangxiyuan
---
 tests/e2e/conftest.py                                      | 2 +-
 tests/e2e/multicard/test_prefix_caching.py                 | 1 -
 tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index 7c44013b..5292673d 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -280,7 +280,7 @@ class VllmRunner:
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
-        enable_chunked_prefill: bool = False,
+        enable_chunked_prefill: bool = True,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = False,
         quantization: Optional[str] = None,
diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py
index f16c94b1..114d5d72 100644
--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/multicard/test_prefix_caching.py
@@ -58,7 +58,6 @@ INPUT_PROMPTS = [
 ]
 
 
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
index aec67bc3..0902fe6d 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -118,7 +118,6 @@ def test_eagle_correctness(
     spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
     with VllmRunner(
             model_name,
-            enable_chunked_prefill=True,
             max_num_seqs=1,
             max_num_batched_tokens=2048,
             gpu_memory_utilization=0.6,
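
Usage note: with this default flipped, every test built on `VllmRunner` now runs
with chunked prefill enabled unless it opts out explicitly. A minimal sketch of
opting out, assuming `VllmRunner` from `tests/e2e/conftest.py` is used as a
context manager and mirrors vLLM's own test-runner API (the model name and the
`generate_greedy` helper are assumptions, not confirmed by this patch):

    # Hypothetical test snippet. VllmRunner comes from tests/e2e/conftest.py;
    # with this patch its default is enable_chunked_prefill=True, so only a
    # test that needs the legacy behavior passes the flag explicitly.
    from tests.e2e.conftest import VllmRunner  # assumed import path

    def test_without_chunked_prefill() -> None:
        with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model
                        enable_chunked_prefill=False) as runner:
            # generate_greedy(prompts, max_tokens) is assumed to follow
            # vLLM's test utilities.
            outputs = runner.generate_greedy(["Hello, my name is"], 32)
            assert len(outputs) == 1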