Revert "drop ascend scheduler" (#4580)
Reverts vllm-project/vllm-ascend#4498

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2
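In every touched test, the revert re-enables the Ascend scheduler by passing an ascend_scheduler_config block through the runner's additional_config argument. Below is a minimal sketch of that pattern, assuming the VllmRunner helper and example_prompts fixture from tests/e2e/conftest as used in the diff; the wrapper function name is hypothetical. The hunks that follow re-add the same block, sometimes with chunked prefill disabled or prefix caching enabled, to the individual tests and torchair fixtures.

    # Minimal sketch (not part of this diff): the configuration pattern the revert restores.
    # VllmRunner and example_prompts are assumed to come from tests/e2e/conftest;
    # run_with_ascend_scheduler is a hypothetical wrapper, not a helper in the repo.
    from tests.e2e.conftest import VllmRunner

    def run_with_ascend_scheduler(model_name, example_prompts, max_tokens=5):
        # Enable the Ascend scheduler for this run via additional_config.
        with VllmRunner(model_name,
                        tensor_parallel_size=2,
                        additional_config={"ascend_scheduler_config": {
                            "enabled": True,
                        }},
                        enforce_eager=False) as vllm_model:
            return vllm_model.generate_greedy(example_prompts, max_tokens)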
@@ -15,14 +15,23 @@ def test_e2e_ep_correctness(model_name):
     max_tokens = 5

     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(model_name, tensor_parallel_size=2,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(
+            model_name,
+            tensor_parallel_size=2,
+            additional_config={"ascend_scheduler_config": {
+                "enabled": True
+            }},
+            enforce_eager=False) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model_name,
-                    tensor_parallel_size=2,
-                    enable_expert_parallel=True,
-                    enforce_eager=False) as vllm_model:
+    with VllmRunner(
+            model_name,
+            tensor_parallel_size=2,
+            enable_expert_parallel=True,
+            additional_config={"ascend_scheduler_config": {
+                "enabled": True
+            }},
+            enforce_eager=False) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(
@@ -49,7 +49,13 @@ def test_generate_with_allgather():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True) as vllm_model:
+            enable_expert_parallel=True,
+            additional_config={
+                "ascend_scheduler_config": {
+                    "enabled": True,
+                    "chunked_prefill_enabled": False,
+                },
+            }) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -70,5 +76,11 @@ def test_generate_with_alltoall():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True) as vllm_model:
+            enable_expert_parallel=True,
+            additional_config={
+                "ascend_scheduler_config": {
+                    "enabled": True,
+                    "chunked_prefill_enabled": False,
+                },
+            }) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -82,6 +82,9 @@ def test_models_distributed_DeepSeek_multistream_moe():
                 "enabled": True,
             },
             "enable_multistream_moe": True,
+            "ascend_scheduler_config": {
+                "enabled": True,
+            },
             "refresh": True,
         },
     ) as vllm_model:
@@ -151,9 +154,14 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
             quantization="ascend",
             enforce_eager=True,
             enable_expert_parallel=True,
-            additional_config={"torchair_graph_config": {
-                "enabled": False,
-            }},
+            additional_config={
+                "torchair_graph_config": {
+                    "enabled": False,
+                },
+                "ascend_scheduler_config": {
+                    "enabled": True,
+                }
+            },
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the with and without prefix caching on V1 scheduler."""
+"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""

 import pytest
@@ -84,3 +84,67 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
         name_0="vllm_output",
         name_1="prefix_cache_output",
     )
+
+
+@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [50])
+def test_prefix_cache_with_ascend_scheduler(model: str,
+                                            max_tokens: int) -> None:
+
+    with VllmRunner(model,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                        },
+                    },
+                    enforce_eager=False,
+                    max_model_len=2048,
+                    tensor_parallel_size=2,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
+
+    with VllmRunner(model,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True,
+                            'enable_prefix_caching': True,
+                        },
+                    },
+                    enforce_eager=False,
+                    max_model_len=2048,
+                    tensor_parallel_size=2,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        prefix_cache_output = vllm_model.generate_greedy(
+            INPUT_PROMPTS, max_tokens)
+
+    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
+    # Disable it now. Fix it or drop the ascend scheduler in the future.
+    # with VllmRunner(model,
+    #                 additional_config={
+    #                     'ascend_scheduler_config': {
+    #                         'enabled': True,
+    #                         'enable_prefix_caching': True,
+    #                         "enable_chunked_prefill": True,
+    #                     },
+    #                 },
+    #                 enforce_eager=True,
+    #                 max_model_len=2048,
+    #                 tensor_parallel_size=2,
+    #                 gpu_memory_utilization=0.7) as vllm_model:
+    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
+    #         INPUT_PROMPTS, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=vllm_output,
+        outputs_1_lst=prefix_cache_output,
+        name_0="vllm_output",
+        name_1="prefix_cache_output",
+    )
+
+    # check_outputs_equal(
+    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
+    #     outputs_1_lst=prefix_cache_output,
+    #     name_0="chunk_prefill_prefix_cache_output",
+    #     name_1="prefix_cache_output",
+    # )
@@ -24,7 +24,6 @@ Run `pytest tests/e2e/multicard/test_qwen3_next.py`.
 import os
-from unittest.mock import patch

 import pytest
 from modelscope import snapshot_download  # type: ignore

 from tests.e2e.conftest import VllmRunner
@@ -64,8 +63,6 @@ def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
     del vllm_model


-@pytest.mark.skip(
-    reason="Qwen3-Next + MTP doesn't work with chunked prefill. Fix Me")
 def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
     example_prompts = [
         "Hello, my name is",
@@ -92,6 +89,12 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
             gpu_memory_utilization=0.8,
             distributed_executor_backend="mp",
             enforce_eager=True,
+            additional_config={
+                "ascend_scheduler_config": {
+                    "enabled": True,
+                    "enable_chunked_prefill": False
+                }
+            },
             speculative_config={
                 "method": "qwen3_next_mtp",
                 "num_speculative_tokens": 1
@@ -44,6 +44,9 @@ def _deepseek_torchair_test_fixture(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
+            "ascend_scheduler_config": {
+                "enabled": True,
+            },
             "refresh": True,
         }
     additional_config.update(**kwargs)
@@ -117,6 +120,9 @@ def _pangu_torchair_test_fixture(

     # torchair is only work without chunked-prefill now
     kwargs = {
+        "ascend_scheduler_config": {
+            "enabled": True,
+        },
         "refresh": True,
     }
     additional_config.update(**kwargs)
@@ -179,6 +185,9 @@ def _qwen_torchair_test_fixture(
         "torchair_graph_config": {
            "enabled": False,
         },
+        "ascend_scheduler_config": {
+            "enabled": True,
+        },
         "refresh": True,
     }

@@ -235,6 +244,9 @@ def _deepseek_v2_lite_torchair_test_fixure(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
+            "ascend_scheduler_config": {
+                "enable": True,
+            },
             "refresh": True,
         }
     additional_config.update(**kwargs)