Revert "drop ascend scheduler" (#4580)
Reverts vllm-project/vllm-ascend#4498

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2
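For context, the restored AscendScheduler is switched on through vLLM's additional_config plumbing, which is exactly what the re-added test below does via VllmRunner. The snippet that follows is a minimal sketch of the same knob used from a plain offline-inference script; it assumes vllm.LLM forwards additional_config to the Ascend platform in the same way, and the model name is only a placeholder, not something taken from this PR.

# Minimal sketch (not part of this PR): enable the ascend scheduler, and optionally
# its prefix caching, through additional_config when constructing an offline LLM.
# Assumes vllm.LLM accepts additional_config and that vllm-ascend reads
# 'ascend_scheduler_config' from it, mirroring the VllmRunner calls in the test below.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model, not from this change
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,                # turn the ascend scheduler on
            "enable_prefix_caching": True,  # optional; the test compares with/without this
        },
    },
    max_model_len=2048,
)

outputs = llm.generate(["Explain prefix caching in one sentence."],
                       SamplingParams(temperature=0.0, max_tokens=50))
print(outputs[0].outputs[0].text)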
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the with and without prefix caching on V1 scheduler."""
+"""Compare outputs with and without prefix caching on the V1 scheduler or AscendScheduler."""

 import pytest

@@ -84,3 +84,67 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
             name_0="vllm_output",
             name_1="prefix_cache_output",
         )
+
+
+@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [50])
+def test_prefix_cache_with_ascend_scheduler(model: str,
+                                            max_tokens: int) -> None:
+
+    with VllmRunner(model,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                        },
+                    },
+                    enforce_eager=False,
+                    max_model_len=2048,
+                    tensor_parallel_size=2,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
+
+    with VllmRunner(model,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                            'enable_prefix_caching': True,
+                        },
+                    },
+                    enforce_eager=False,
+                    max_model_len=2048,
+                    tensor_parallel_size=2,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        prefix_cache_output = vllm_model.generate_greedy(
+            INPUT_PROMPTS, max_tokens)
+
+    # TODO: enabling APC together with chunked prefill on the ascend scheduler leads to
+    # accuracy problems. Disable it for now; fix it or drop the ascend scheduler in the future.
+    # with VllmRunner(model,
+    #                 additional_config={
+    #                     'ascend_scheduler_config': {
+    #                         'enabled': True,
+    #                         'enable_prefix_caching': True,
+    #                         "enable_chunked_prefill": True,
+    #                     },
+    #                 },
+    #                 enforce_eager=True,
+    #                 max_model_len=2048,
+    #                 tensor_parallel_size=2,
+    #                 gpu_memory_utilization=0.7) as vllm_model:
+    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
+    #         INPUT_PROMPTS, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=vllm_output,
+        outputs_1_lst=prefix_cache_output,
+        name_0="vllm_output",
+        name_1="prefix_cache_output",
+    )
+
+    # check_outputs_equal(
+    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
+    #     outputs_1_lst=prefix_cache_output,
+    #     name_0="chunk_prefill_prefix_cache_output",
+    #     name_1="prefix_cache_output",
+    # )
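As a usage note, the restored test targets a 2-card setup (tensor_parallel_size=2) and is currently skipped because of the accuracy issue noted in the TODO above. A sketch of invoking it locally follows; the file path is an assumption, since the diff header naming the file is not visible in this excerpt, and the skip marker would have to be removed for the test to actually execute.

# Illustrative only: run the restored ascend-scheduler prefix-caching test with pytest.
# The path below is assumed (not shown in this diff excerpt). The test carries
# @pytest.mark.skip upstream, so it reports as skipped unless that marker is removed.
# Requires a host with at least 2 Ascend NPUs because of tensor_parallel_size=2.
import pytest

pytest.main([
    "-sv",
    "tests/e2e/multicard/test_prefix_caching.py"
    "::test_prefix_cache_with_ascend_scheduler",
])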