[CI] drop ascend scheduler test (#4582)

Let's drop the ascend scheduler test first to ensure all functions work without it. - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-01 20:33:50 +08:00
parent 203b4e6777
commit 27b09ca9b9
28 changed files with 53 additions and 376 deletions
--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/multicard/test_prefix_caching.py
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )
-
-
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_ascend_scheduler(model: str,
-                                            max_tokens: int) -> None:
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_prefix_caching': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
-
-    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
-    # Disable it now. Fix it or drop the ascend scheduler in the future.
-    # with VllmRunner(model,
-    #                 additional_config={
-    #                     'ascend_scheduler_config': {
-    #                         'enabled': True,
-    #                         'enable_prefix_caching': True,
-    #                         "enable_chunked_prefill": True,
-    #                     },
-    #                 },
-    #                 enforce_eager=True,
-    #                 max_model_len=2048,
-    #                 tensor_parallel_size=2,
-    #                 gpu_memory_utilization=0.7) as vllm_model:
-    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
-    #         INPUT_PROMPTS, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=prefix_cache_output,
-        name_0="vllm_output",
-        name_1="prefix_cache_output",
-    )
-
-    # check_outputs_equal(
-    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
-    #     outputs_1_lst=prefix_cache_output,
-    #     name_0="chunk_prefill_prefix_cache_output",
-    #     name_1="prefix_cache_output",
-    # )