From 27b09ca9b9515a271683a626cec85d33534520b4 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Mon, 1 Dec 2025 20:33:50 +0800
Subject: [PATCH] [CI] drop ascend scheduler test (#4582)

Let's drop the ascend scheduler test first to ensure all functions work
without it.

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

Signed-off-by: wangxiyuan
---
 .github/workflows/_e2e_test.yaml              |  1 -
 .../test_offline_inference_parallel_310p.py   |  3 -
 tests/e2e/multicard/test_expert_parallel.py   | 21 ++---
 .../multicard/test_fused_moe_allgather_ep.py  | 16 +---
 .../test_offline_inference_distributed.py     | 14 +---
 tests/e2e/multicard/test_prefix_caching.py    | 64 ---------------
 tests/e2e/multicard/test_qwen3_next.py        |  6 --
 .../e2e/multicard/test_torchair_graph_mode.py | 12 ---
 .../test_mtpx_deepseek_r1_0528_w8a8.py        |  6 +-
 ...test_prefix_cache_deepseek_r1_0528_w8a8.py |  3 -
 .../test_prefix_cache_qwen3_32b_int8.py       |  7 +-
 .../test_qwen3_32b_int8_a3_feature_stack3.py  |  3 +-
 .../models/test_deepseek_r1_0528_w8a8.py      | 10 +--
 .../models/test_deepseek_r1_w8a8_eplb.py      |  3 -
 .../models/test_deepseek_v3_2_exp_w8a8.py     |  3 +-
 .../e2e/nightly/models/test_qwen2_5_vl_32b.py |  5 +-
 .../models/test_qwen3_235b_a22b_w8a8_eplb.py  |  6 +-
 .../nightly/models/test_qwen3_235b_w8a8.py    |  6 --
 tests/e2e/nightly/models/test_qwq_32b.py      |  2 -
 .../models/DeepSeek-R1-W8A8-A2-torchair.yaml  |  4 +-
 .../config/models/DeepSeek-R1-W8A8-A2.yaml    |  4 +-
 .../config/models/DeepSeek-R1-W8A8-EPLB.yaml  |  8 +-
 .../config/models/DeepSeek-R1-W8A8.yaml       |  8 +-
 .../config/models/DeepSeek-V3_2-Exp-bf16.yaml |  4 +-
 .../spec_decode_v1/test_v1_mtp_correctness.py | 41 +++++-----
 tests/e2e/singlecard/test_ascend_scheduler.py | 52 ------------
 tests/e2e/singlecard/test_chunked.py          | 82 -------------------
 tests/e2e/singlecard/test_vlm.py              | 35 --------
 28 files changed, 53 insertions(+), 376 deletions(-)
 delete mode 100644 tests/e2e/singlecard/test_chunked.py

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index c07906ba..c7e883a0 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -94,7 +94,6 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
           pytest -sv tests/e2e/singlecard/test_bge_model.py
           pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_chunked.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
           # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
diff --git a/tests/e2e/310p/test_offline_inference_parallel_310p.py b/tests/e2e/310p/test_offline_inference_parallel_310p.py
index 6bf33568..7ba7ef73 100644
--- a/tests/e2e/310p/test_offline_inference_parallel_310p.py
+++ b/tests/e2e/310p/test_offline_inference_parallel_310p.py
@@ -29,9 +29,6 @@ ADDITIONAL_CONFIG = [{
     "additional_config": {
         "torchair_graph_config": {
             "enabled": True
-        },
-        "ascend_scheduler_config": {
-            "enabled": True,
         }
     }
 }]
diff --git a/tests/e2e/multicard/test_expert_parallel.py b/tests/e2e/multicard/test_expert_parallel.py
index f1076013..b8f03d5f 100644
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
     max_tokens = 5

     # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2,
+                    enforce_eager=False) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True,
+                    enforce_eager=False) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
index 9335e19a..85d246e5 100644
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)


@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
             tensor_parallel_size=2,
             max_model_len=1024,
             dtype="auto",
-            enable_expert_parallel=True,
-            additional_config={
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                    "chunked_prefill_enabled": False,
-                },
-            }) as vllm_model:
+            enable_expert_parallel=True) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index 320c3bdf..1380c49e 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
                 "enabled": True,
             },
             "enable_multistream_moe": True,
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
             "refresh": True,
         },
     ) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
         quantization="ascend",
         enforce_eager=True,
         enable_expert_parallel=True,
-        additional_config={
-            "torchair_graph_config": {
-                "enabled": False,
-            },
-            "ascend_scheduler_config": {
-                "enabled": True,
-            }
-        },
+        additional_config={"torchair_graph_config": {
+            "enabled": False,
+        }},
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens)
diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py
index e2991662..f16c94b1 100644
--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/multicard/test_prefix_caching.py
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
         name_0="vllm_output",
         name_1="prefix_cache_output",
     )
-
-
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_ascend_scheduler(model: str,
-                                            max_tokens: int) -> None:
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_prefix_caching': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
-
-    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
-    # Disable it now. Fix it or drop the ascend scheduler in the future.
-    # with VllmRunner(model,
-    #                 additional_config={
-    #                     'ascend_scheduler_config': {
-    #                         'enabled': True,
-    #                         'enable_prefix_caching': True,
-    #                         "enable_chunked_prefill": True,
-    #                     },
-    #                 },
-    #                 enforce_eager=True,
-    #                 max_model_len=2048,
-    #                 tensor_parallel_size=2,
-    #                 gpu_memory_utilization=0.7) as vllm_model:
-    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
-    #         INPUT_PROMPTS, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=prefix_cache_output,
-        name_0="vllm_output",
-        name_1="prefix_cache_output",
-    )
-
-    # check_outputs_equal(
-    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
-    #     outputs_1_lst=prefix_cache_output,
-    #     name_0="chunk_prefill_prefix_cache_output",
-    #     name_1="prefix_cache_output",
-    # )
diff --git a/tests/e2e/multicard/test_qwen3_next.py b/tests/e2e/multicard/test_qwen3_next.py
index e51748ea..eaacd838 100644
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
         gpu_memory_utilization=0.8,
         distributed_executor_backend="mp",
         enforce_eager=True,
-        additional_config={
-            "ascend_scheduler_config": {
-                "enabled": True,
-                "enable_chunked_prefill": False
-            }
-        },
         speculative_config={
             "method": "qwen3_next_mtp",
             "num_speculative_tokens": 1
diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py
index 6a488782..3472051e 100644
--- a/tests/e2e/multicard/test_torchair_graph_mode.py
+++ b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
             "refresh": True,
         }
     additional_config.update(**kwargs)
@@ -121,9 +118,6 @@ def _pangu_torchair_test_fixture(

     # torchair is only work without chunked-prefill now
     kwargs = {
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }
     additional_config.update(**kwargs)
@@ -186,9 +180,6 @@ def _qwen_torchair_test_fixture(
         "torchair_graph_config": {
             "enabled": False,
         },
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
         "refresh": True,
     }

@@ -245,9 +236,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
     kwargs = {}
     if not use_v1_schduler:
         kwargs = {
-            "ascend_scheduler_config": {
-                "enable": True,
-            },
             "refresh": True,
         }
     additional_config.update(**kwargs)
diff --git a/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
index 65d01b21..880b44ae 100644
--- a/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
         "VLLM_RPC_TIMEOUT": "3600000",
         "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
     }
-    additional_config: dict[str, Any] = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-    }
+    additional_config: dict[str, Any] = {}
     speculative_config = {
         "num_speculative_tokens": 2,
         "method": "deepseek_mtp"
     }
diff --git a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
index 8ac1883d..80157588 100644
--- a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
     }
     additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
         "torchair_graph_config": {
             "enabled": True,
             "enable_multistream_moe": False,
diff --git a/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py b/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
index 3ee23287..fdf7167b 100644
--- a/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
@@ -68,12 +68,7 @@ aisbench_cases75 = [{
 async def test_models(model: str) -> None:
     port = get_open_port()
     env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
-    additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-        "enable_weight_nz_layout": True
-    }
+    additional_config = {"enable_weight_nz_layout": True}
     server_args = [
         "--quantization", "ascend", "--reasoning-parser", "qwen3",
         "--tensor-parallel-size", "4", "--port",
diff --git a/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py b/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
index 17a7f4b6..9fa2d1e5 100644
--- a/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
+++ b/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
         "0.9", "--block-size", "128", "--max-num-seqs", "256",
         "--enforce-eager", "--max-model-len", "35840",
         "--max-num-batched-tokens", "35840", "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
-        "--compilation-config",
+        '{"enable_weight_nz_layout":true}', "--compilation-config",
         '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
     ]
     with RemoteOpenAIServer(model,
diff --git a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
index c9126577..35082edb 100644
--- a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
@@ -33,7 +33,6 @@ MODES = [
     "single",
     "aclgraph",
     "aclgraph_mlapo",
-    "no_chunkprefill",
 ]

 prompts = [
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
         "method": "deepseek_mtp"
     }
     additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
         "torchair_graph_config": {
             "enabled": True,
             "enable_multistream_moe": False,
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
     if mode == "aclgraph_mlapo":
         env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
         additional_config["torchair_graph_config"] = {"enabled": False}
-    if mode == "no_chunkprefill":
-        additional_config["ascend_scheduler_config"] = {"enabled": True}
-        i = server_args.index("--max-num-batched-tokens") + 1
-        server_args[i] = "36864"
     server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
@@ -134,7 +126,7 @@ async def test_models(model: str, mode: str) -> None:
         choices: list[openai.types.CompletionChoice] = batch.choices
"empty response" print(choices) - if mode in ["single", "no_chunkprefill"]: + if mode in ["single"]: return # aisbench test run_aisbench_cases(model, diff --git a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py index bca2baf0..6413aba0 100644 --- a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py +++ b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py @@ -71,9 +71,6 @@ async def test_models(model: str) -> None: "cudagraph_mode": "FULL_DECODE_ONLY" } additional_config: dict[str, Any] = { - "ascend_scheduler_config": { - "enabled": False - }, "torchair_graph_config": { "enabled": True }, diff --git a/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py b/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py index 217b2786..9d5b78f0 100644 --- a/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py +++ b/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py @@ -92,8 +92,7 @@ async def test_models(model: str, tp_size: int, dp_size: int, "--gpu-memory-utilization", "0.9", "--additional-config", - '{"ascend_scheduler_config":{"enabled":true},' - '"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}', + '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}', ] if full_graph: server_args += [ diff --git a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py index fe6bbedf..77c1a7e1 100644 --- a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py +++ b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py @@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None: str(tp_size), "--port", str(port), "--max-model-len", "30000", "--max-num-batched-tokens", "40000", "--max-num-seqs", "400", "--trust-remote-code", - "--gpu-memory-utilization", "0.8", "--additional-config", - '{"ascend_scheduler_config":{"enabled":false}}', - "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}' + "--gpu-memory-utilization", "0.8", "--compilation_config", + '{"cudagraph_mode": "FULL_DECODE_ONLY"}' ] request_keyword_args: dict[str, Any] = { **api_keyword_args, diff --git a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py index 945d7cae..efbf77d2 100644 --- a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py +++ b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py @@ -60,11 +60,7 @@ async def test_models(model: str) -> None: "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1" } - additional_config: dict[str, Any] = { - "ascend_scheduler_config": { - "enabled": False - }, - } + additional_config: dict[str, Any] = {} compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"} server_args = [ "--quantization", "ascend", "--async-scheduling", diff --git a/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py b/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py index 8220e4d5..055a452e 100644 --- a/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py +++ b/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py @@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None: "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1" } - additional_config: dict[str, Any] = { - "ascend_scheduler_config": { - "enabled": False - }, - } compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"} server_args = [ "--quantization", "ascend", "--async-scheduling", @@ -82,7 +77,6 @@ async def test_models(model: str, 
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:

     server_args.extend(
         ["--compilation-config", json.dumps(compilation_config)])
-    server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
diff --git a/tests/e2e/nightly/models/test_qwq_32b.py b/tests/e2e/nightly/models/test_qwq_32b.py
index a60eff22..824651ba 100644
--- a/tests/e2e/nightly/models/test_qwq_32b.py
+++ b/tests/e2e/nightly/models/test_qwq_32b.py
@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
         server_args.remove(
             '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
         )
-        server_args.append("--additional-config")
-        server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
         server_args.append("--enforce-eager")
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
index 42b70f76..7bfe3f5e 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
@@ -30,7 +30,7 @@ deployment:
       --quantization ascend
       --gpu-memory-utilization 0.9
       --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

   -
     server_cmd: >
@@ -51,7 +51,7 @@ deployment:
       --quantization ascend
       --gpu-memory-utilization 0.9
       --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
   acc:
     case_type: accuracy
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
index cf44bc8f..01100f29 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
@@ -31,7 +31,7 @@ deployment:
       --gpu-memory-utilization 0.9
       --enforce-eager
       --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

   -
     server_cmd: >
@@ -53,5 +53,5 @@ deployment:
       --gpu-memory-utilization 0.9
       --enforce-eager
       --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
index 9a4c3d94..6ca189c4 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
@@ -50,7 +50,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'

   -
     server_cmd: >
@@ -80,7 +80,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
   -
     server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -111,7 +111,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
   -
     server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -141,7 +141,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
      }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+      '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
 benchmarks:
   perf:
     case_type: performance
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
index a8e49290..37a024b9 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
@@ -49,7 +49,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+      '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'

   -
     server_cmd: >
@@ -79,7 +79,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+      '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
   -
     server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -110,7 +110,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
+      '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
   -
     server_cmd: >
       vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -140,7 +140,7 @@ deployment:
         "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
       }'
       --additional-config
-      '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
+      '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
 benchmarks:
   perf:
     case_type: performance
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
index 6dafd3cc..40ac6476 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
@@ -29,7 +29,7 @@ deployment:
       --trust-remote-code
       --no-enable-prefix-caching
       --gpu-memory-utilization 0.9
-      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'

   -
     server_cmd: >
@@ -49,5 +49,5 @@ deployment:
       --trust-remote-code
       --no-enable-prefix-caching
       --gpu-memory-utilization 0.92
-      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 benchmarks:
diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
index 2f56d9d2..6b90ec36 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
     if graph_mode == CUDAGraphMode.FULL:
         graph_mode_str = "FULL_DECODE_ONLY"

-    with VllmRunner(
-        model_name,
-            tensor_parallel_size=1,
-            max_num_seqs=256,
-            gpu_memory_utilization=0.7,
-            distributed_executor_backend="mp",
-            enable_expert_parallel=True,
-            speculative_config={
-                "method": "deepseek_mtp",
-                "num_speculative_tokens": num_speculative_tokens,
-                "disable_padded_drafter_batch": disable_padded_drafter_batch,
-            },
-            enforce_eager=enforce_eager,
-            max_model_len=2000,
-            compilation_config=CompilationConfig(
-                cudagraph_mode=graph_mode_str,
-                cudagraph_capture_sizes=[12],
-            ),
-            additional_config={"ascend_scheduler_config": {
-                "enabled": False
-            }}) as spec_llm:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=1,
+                    max_num_seqs=256,
+                    gpu_memory_utilization=0.7,
+                    distributed_executor_backend="mp",
+                    enable_expert_parallel=True,
+                    speculative_config={
+                        "method":
+                        "deepseek_mtp",
+                        "num_speculative_tokens":
+                        num_speculative_tokens,
+                        "disable_padded_drafter_batch":
+                        disable_padded_drafter_batch,
+                    },
+                    enforce_eager=enforce_eager,
+                    max_model_len=2000,
+                    compilation_config=CompilationConfig(
+                        cudagraph_mode=graph_mode_str,
+                        cudagraph_capture_sizes=[12],
+                    )) as spec_llm:
         spec_outputs = spec_llm.generate(example_prompts, sampling_config)

     matches = 0
diff --git a/tests/e2e/singlecard/test_ascend_scheduler.py b/tests/e2e/singlecard/test_ascend_scheduler.py
index 502a8103..0c996e4e 100644
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -12,11 +12,6 @@ MODEL = "Qwen/Qwen3-0.6B"
 @pytest.mark.parametrize("enforce_eager", [True, False])
 def test_concurrent_partial_prefill(enforce_eager):
     with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
                     max_num_seqs=3,
                     max_num_batched_tokens=8192,
                     enforce_eager=enforce_eager,
@@ -31,11 +26,6 @@ def test_concurrent_partial_prefill(enforce_eager):
 @pytest.mark.parametrize("enforce_eager", [True, False])
 def test_prefix_cache_stats_is_recorded(enforce_eager):
     with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
                     max_num_seqs=3,
                     max_num_batched_tokens=8192,
                     enforce_eager=enforce_eager,
@@ -47,48 +37,6 @@ def test_prefix_cache_stats_is_recorded(enforce_eager):
         assert outputs[0].num_cached_tokens == 128
-
-
-@pytest.mark.parametrize("max_tokens",
-                         [4])  # cannot align results when max_tokens > 4
-@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
-def test_chunked_prefill_with_ascend_scheduler(
-        max_tokens: int, chunked_prefill_token_size: int) -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
-    ]
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_chunked_prefill': True,
-                        },
-                    },
-                    max_num_seqs=max_num_seqs,
-                    max_num_batched_tokens=max_num_batched_tokens,
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        chunked_prefill_output = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=chunked_prefill_output,
-        name_0="vllm_output",
-        name_1="chunked_prefill_output",
-    )
-
-
 @pytest.mark.parametrize("max_tokens",
                          [4])  # cannot align results when max_tokens > 4
 @pytest.mark.parametrize("chunked_prefill_token_size", [2048])
diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py
deleted file mode 100644
index f6eacb71..00000000
--- a/tests/e2e/singlecard/test_chunked.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""
-Compare the outputs of vLLM with and without aclgraph.
-
-Run `pytest tests/compile/test_aclgraph.py`.
-""" -import gc - -import pytest -import torch -from vllm import SamplingParams - -from tests.e2e.conftest import VllmRunner - -MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("max_tokens", [1]) -def test_models( - model: str, - max_tokens: int, -) -> None: - prompts = ["The president of the United States is"] - - sampling_params = SamplingParams( - max_tokens=max_tokens, - temperature=0.0, - ) - - with VllmRunner(model, - long_prefill_token_threshold=20, - enforce_eager=False) as vllm_model: - output1 = vllm_model.generate(prompts, sampling_params) - - with VllmRunner(model, - enforce_eager=False, - additional_config={ - 'ascend_scheduler_config': { - 'enabled': True - }, - }) as vllm_model: - output2 = vllm_model.generate(prompts, sampling_params) - - # Extract the generated token IDs for comparison - token_ids1 = output1[0][0][0] - token_ids2 = output2[0][0][0] - - print(f"Token IDs 1: {token_ids1}") - print(f"Token IDs 2: {token_ids2}") - - # Convert token IDs to tensors and calculate cosine similarity - # Take the length of a shorter sequence to ensure consistent dimensions - min_len = min(len(token_ids1), len(token_ids2)) - - tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32) - tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32) - - # Calculate similarity using torch.cosine_similarity - similarity = torch.cosine_similarity(tensor1, tensor2, dim=0) - print(f"Token IDs cosine similarity: {similarity.item()}") - - assert similarity > 0.95 - - gc.collect() - torch.npu.empty_cache() - torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py index cc3d50f8..95456679 100644 --- a/tests/e2e/singlecard/test_vlm.py +++ b/tests/e2e/singlecard/test_vlm.py @@ -20,7 +20,6 @@ Run `pytest tests/test_offline_inference.py`. """ -import pytest from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset @@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template): assert output_str, "Generated output should not be empty." -@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. " - "Add this back after fixing the issue.") -def test_multimodal_ascend_scheduler(prompt_template): - image = ImageAsset("cherry_blossom") \ - .pil_image.convert("RGB") - img_questions = [ - "What is the content of this image?", - "Describe the content of this image in detail.", - "What's in the image?", - "Where is this image taken?", - ] - images = [image] * len(img_questions) - prompts = prompt_template(img_questions) - with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct", - max_model_len=4096, - additional_config={ - 'ascend_scheduler_config': { - 'enabled': True, - }, - }, - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - "fps": 1, - }, - enforce_eager=True) as vllm_model: - outputs = vllm_model.generate_greedy(prompts=prompts, - images=images, - max_tokens=64) - assert len(outputs) == len(prompts) - for _, output_str in outputs: - assert output_str, "Generated output should not be empty." - - def test_multimodal_audio(): audio_prompt = "".join([ f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"