[CI] enable chunked prefill by default (#4569)
Set `enable_chunked_prefill` to True by default in the e2e test runner, to keep the same behavior as vLLM.

- vLLM version: v0.11.2

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
@@ -280,7 +280,7 @@ class VllmRunner:
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
-        enable_chunked_prefill: bool = False,
+        enable_chunked_prefill: bool = True,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = False,
         quantization: Optional[str] = None,
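With the default flipped, e2e tests get chunked prefill without passing the flag, and any test that still needs the old behavior can opt out explicitly. A minimal sketch of both patterns, assuming the repo's VllmRunner helper is importable from the test conftest (the import path and model name below are assumptions, not part of this diff):

    # Sketch only: VllmRunner lives in the repo's test conftest; the import
    # path and model name are placeholders, not taken from this commit.
    from tests.e2e.conftest import VllmRunner

    # Chunked prefill is now on implicitly (enable_chunked_prefill defaults to True):
    with VllmRunner("some-model") as runner:
        ...

    # Tests that need the previous behavior opt out explicitly:
    with VllmRunner("some-model", enable_chunked_prefill=False) as runner:
        ...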
@@ -58,7 +58,6 @@ INPUT_PROMPTS = [
 ]
 
 
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
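For reference, stacked parametrize decorators like the ones above expand into one test per combination of arguments; a self-contained pytest sketch of the same pattern (names and values here are illustrative, not from the repo):

    import pytest

    MODELS = ["model-a", "model-b"]  # placeholder model list

    @pytest.mark.parametrize("max_tokens", [50])
    @pytest.mark.parametrize("model", MODELS)
    def test_prefix_cache_smoke(model: str, max_tokens: int) -> None:
        # pytest runs this once per (model, max_tokens) pair: 2 x 1 = 2 cases
        assert max_tokens > 0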
@@ -118,7 +118,6 @@ def test_eagle_correctness(
     spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
     with VllmRunner(
             model_name,
-            enable_chunked_prefill=True,
             max_num_seqs=1,
             max_num_batched_tokens=2048,
             gpu_memory_utilization=0.6,
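The keyword arguments above are forwarded to vLLM's engine, where enable_chunked_prefill is a standard engine argument. A minimal sketch of the equivalent direct construction (the model name is a placeholder, and VllmRunner's exact forwarding logic is assumed rather than shown in this diff):

    from vllm import LLM

    # Illustrative values mirroring the test above; only enable_chunked_prefill
    # and the listed limits come from the diff, the model name is a placeholder.
    llm = LLM(
        model="some-model",
        enable_chunked_prefill=True,
        max_num_seqs=1,
        max_num_batched_tokens=2048,
        gpu_memory_utilization=0.6,
    )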