ACLgraph enable: Test cases revisions for all features (#3388)
### What this PR does / why we need it?
This PR revises the test cases for various features in the repository, enabling ACL graph in those test cases.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
UT

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
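The pattern is the same in every touched test: either flip `enforce_eager=True` to `enforce_eager=False` so the graph (ACL graph) execution path is exercised, or parametrize the test so it runs in both eager and graph modes. A minimal sketch of the parametrized form is below; the import path, test name, and exact arguments are illustrative assumptions, not code from this PR:

```python
import pytest

# NOTE: this import path is an assumption about where the repository's
# VllmRunner test helper lives; adjust it to the actual location.
from tests.e2e.conftest import VllmRunner

MODEL = "Qwen/Qwen3-0.6B"


# Hypothetical test (not part of this PR): run once in eager mode and once
# with enforce_eager=False so the ACL graph path is captured and exercised.
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_generate_in_both_modes(enforce_eager):
    with VllmRunner(MODEL,
                    max_model_len=2048,
                    enforce_eager=enforce_eager,
                    gpu_memory_utilization=0.7) as vllm_model:
        outputs = vllm_model.model.generate(["Hello my name is Robert and I"] * 3)
        # One completion per prompt is expected in both execution modes.
        assert len(outputs) == 3
        for output in outputs:
            assert len(output.outputs) == 1
```

The hunks below apply this change across the speculative decoding (ngram and eagle), scheduler, sleep-mode, chunked-prefill, embedding, LoRA, quantization, and multimodal tests.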
@@ -71,7 +71,7 @@ def test_ngram_correctness(
     should be the same when using ngram speculative decoding.
     '''
     pytest.skip("Not current support for the test.")
-    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
+    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
     ref_outputs = ref_llm.chat(test_prompts, sampling_config)
     del ref_llm
     with VllmRunner(model_name,
@@ -82,7 +82,7 @@ def test_ngram_correctness(
                         "num_speculative_tokens": 3,
                     },
                     max_model_len=1024,
-                    enforce_eager=True) as runner:
+                    enforce_eager=False) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)
         matches = 0
         misses = 0
@@ -111,7 +111,7 @@ def test_eagle_correctness(
     should be the same when using eagle speculative decoding.
     '''

-    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
+    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
     ref_outputs = ref_llm.chat(test_prompts, sampling_config)
     del ref_llm

@@ -129,7 +129,7 @@ def test_eagle_correctness(
             "max_model_len": 128,
         },
         max_model_len=128,
-        enforce_eager=True,
+        enforce_eager=False,
     ) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)

@@ -9,7 +9,8 @@ from tests.e2e.model_utils import check_outputs_equal
 MODEL = "Qwen/Qwen3-0.6B"


-def test_concurrent_partial_prefill():
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_concurrent_partial_prefill(enforce_eager):
     with VllmRunner(MODEL,
                     additional_config={
                         'ascend_scheduler_config': {
@@ -18,7 +19,7 @@ def test_concurrent_partial_prefill():
                     },
                     max_num_seqs=3,
                     max_num_batched_tokens=2048,
-                    enforce_eager=True,
+                    enforce_eager=enforce_eager,
                     max_model_len=2048,
                     gpu_memory_utilization=0.7) as vllm_model:
         outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
@@ -28,7 +29,8 @@ def test_concurrent_partial_prefill():
         assert len(output.outputs) == 1


-def test_prefix_cache_stats_is_recorded():
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_prefix_cache_stats_is_recorded(enforce_eager):
     with VllmRunner(MODEL,
                     additional_config={
                         'ascend_scheduler_config': {
@@ -37,7 +39,7 @@ def test_prefix_cache_stats_is_recorded():
                     },
                     max_num_seqs=3,
                     max_num_batched_tokens=2048,
-                    enforce_eager=True,
+                    enforce_eager=enforce_eager,
                     max_model_len=2048,
                     gpu_memory_utilization=0.7) as vllm_model:
         # 17 tokens will make sure first 16 tokens are cached in a block
@@ -74,7 +74,7 @@ def test_end_to_end():
     sampling_params = SamplingParams(temperature=0, max_tokens=10)

     with VllmRunner("Qwen/Qwen3-0.6B",
-                    enforce_eager=True,
+                    enforce_eager=False,
                     enable_sleep_mode=True) as runner:

         output = runner.model.generate(prompt, sampling_params)
@@ -43,12 +43,13 @@ def test_models(
         temperature=0.0,
     )

-    with VllmRunner(model, long_prefill_token_threshold=20,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(model,
+                    long_prefill_token_threshold=20,
+                    enforce_eager=False) as vllm_model:
         output1 = vllm_model.generate(prompts, sampling_params)

     with VllmRunner(model,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     additional_config={
                         'ascend_scheduler_config': {
                             'enabled': True
@@ -29,7 +29,7 @@ def test_embed_models_correctness():
     with VllmRunner(
             model_name,
             task="embed",
-            enforce_eager=True,
+            enforce_eager=False,
     ) as vllm_runner:
         vllm_outputs = vllm_runner.encode(queries)

@@ -51,7 +51,7 @@ def test_ilama_lora(ilama_lora_files):
                     max_loras=4,
                     max_model_len=1024,
                     max_num_seqs=16,
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:

         output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
         for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -28,7 +28,7 @@ def test_quant_W8A8():
     with VllmRunner(
             snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
             max_model_len=8192,
-            enforce_eager=True,
+            enforce_eager=False,
             gpu_memory_utilization=0.7,
             quantization="ascend",
     ) as vllm_model:
@@ -46,7 +46,7 @@ def test_multimodal_vl(prompt_template):
                         "max_pixels": 1280 * 28 * 28,
                         "fps": 1,
                     },
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         outputs = vllm_model.generate_greedy(prompts=prompts,
                                              images=images,
                                              max_tokens=64)