[CI] drop ascend scheduler test (#4582)

Let's drop the ascend scheduler tests first to ensure all functions work without it. - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-01 20:33:50 +08:00
parent 203b4e6777
commit 27b09ca9b9
28 changed files with 53 additions and 376 deletions
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
    max_tokens = 5

    # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2,
+                    enforce_eager=False) as vllm_model:
        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True,
+                    enforce_eager=False) as vllm_model:
        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
                    tensor_parallel_size=2,
                    max_model_len=1024,
                    dtype="auto",
-                    enable_expert_parallel=True,
-                    additional_config={
-                        "ascend_scheduler_config": {
-                            "enabled": True,
-                            "chunked_prefill_enabled": False,
-                        },
-                    }) as vllm_model:
+                    enable_expert_parallel=True) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)


@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
                    tensor_parallel_size=2,
                    max_model_len=1024,
                    dtype="auto",
-                    enable_expert_parallel=True,
-                    additional_config={
-                        "ascend_scheduler_config": {
-                            "enabled": True,
-                            "chunked_prefill_enabled": False,
-                        },
-                    }) as vllm_model:
+                    enable_expert_parallel=True) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
                    "enabled": True,
                },
                "enable_multistream_moe": True,
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                },
                "refresh": True,
            },
    ) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
            quantization="ascend",
            enforce_eager=True,
            enable_expert_parallel=True,
-            additional_config={
-                "torchair_graph_config": {
-                    "enabled": False,
-                },
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                }
-            },
+            additional_config={"torchair_graph_config": {
+                "enabled": False,
+            }},
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)

--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/multicard/test_prefix_caching.py
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )
-
-
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_ascend_scheduler(model: str,
-                                            max_tokens: int) -> None:
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_prefix_caching': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
-
-    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
-    # Disable it now. Fix it or drop the ascend scheduler in the future.
-    # with VllmRunner(model,
-    #                 additional_config={
-    #                     'ascend_scheduler_config': {
-    #                         'enabled': True,
-    #                         'enable_prefix_caching': True,
-    #                         "enable_chunked_prefill": True,
-    #                     },
-    #                 },
-    #                 enforce_eager=True,
-    #                 max_model_len=2048,
-    #                 tensor_parallel_size=2,
-    #                 gpu_memory_utilization=0.7) as vllm_model:
-    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
-    #         INPUT_PROMPTS, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=prefix_cache_output,
-        name_0="vllm_output",
-        name_1="prefix_cache_output",
-    )
-
-    # check_outputs_equal(
-    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
-    #     outputs_1_lst=prefix_cache_output,
-    #     name_0="chunk_prefill_prefix_cache_output",
-    #     name_1="prefix_cache_output",
-    # )
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
                    gpu_memory_utilization=0.8,
                    distributed_executor_backend="mp",
                    enforce_eager=True,
-                    additional_config={
-                        "ascend_scheduler_config": {
-                            "enabled": True,
-                            "enable_chunked_prefill": False
-                        }
-                    },
                    speculative_config={
                        "method": "qwen3_next_mtp",
                        "num_speculative_tokens": 1
--- a/tests/e2e/multicard/test_torchair_graph_mode.py
+++ b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
    kwargs = {}
    if not use_v1_schduler:
        kwargs = {
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
            "refresh": True,
        }
    additional_config.update(**kwargs)
@@ -121,9 +118,6 @@ def _pangu_torchair_test_fixture(

    # torchair is only work without chunked-prefill now
    kwargs = {
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
        "refresh": True,
    }
    additional_config.update(**kwargs)
@@ -186,9 +180,6 @@ def _qwen_torchair_test_fixture(
        "torchair_graph_config": {
            "enabled": False,
        },
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
        "refresh": True,
    }

@@ -245,9 +236,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
    kwargs = {}
    if not use_v1_schduler:
        kwargs = {
-            "ascend_scheduler_config": {
-                "enable": True,
-            },
            "refresh": True,
        }
    additional_config.update(**kwargs)