[CI] drop ascend scheduler test (#4582)

Let's drop the ascend scheduler tests first to ensure all functions work without it. - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-01 20:33:50 +08:00
parent 203b4e6777
commit 27b09ca9b9
28 changed files with 53 additions and 376 deletions
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -94,7 +94,6 @@ jobs:
          pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
          pytest -sv tests/e2e/singlecard/test_bge_model.py
          pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_chunked.py
          pytest -sv tests/e2e/singlecard/test_embedding.py
          # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
--- a/tests/e2e/310p/test_offline_inference_parallel_310p.py
+++ b/tests/e2e/310p/test_offline_inference_parallel_310p.py
@@ -29,9 +29,6 @@ ADDITIONAL_CONFIG = [{
    "additional_config": {
        "torchair_graph_config": {
            "enabled": True
-        },
-        "ascend_scheduler_config": {
-            "enabled": True,
        }
    }
 }]
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
    max_tokens = 5

    # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name, tensor_parallel_size=2,
+                    enforce_eager=False) as vllm_model:
        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=2,
-            enable_expert_parallel=True,
-            additional_config={"ascend_scheduler_config": {
-                "enabled": True
-            }},
-            enforce_eager=False) as vllm_model:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    enable_expert_parallel=True,
+                    enforce_eager=False) as vllm_model:
        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
                    tensor_parallel_size=2,
                    max_model_len=1024,
                    dtype="auto",
-                    enable_expert_parallel=True,
-                    additional_config={
-                        "ascend_scheduler_config": {
-                            "enabled": True,
-                            "chunked_prefill_enabled": False,
-                        },
-                    }) as vllm_model:
+                    enable_expert_parallel=True) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)


@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
                    tensor_parallel_size=2,
                    max_model_len=1024,
                    dtype="auto",
-                    enable_expert_parallel=True,
-                    additional_config={
-                        "ascend_scheduler_config": {
-                            "enabled": True,
-                            "chunked_prefill_enabled": False,
-                        },
-                    }) as vllm_model:
+                    enable_expert_parallel=True) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
                    "enabled": True,
                },
                "enable_multistream_moe": True,
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                },
                "refresh": True,
            },
    ) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
            quantization="ascend",
            enforce_eager=True,
            enable_expert_parallel=True,
-            additional_config={
-                "torchair_graph_config": {
-                    "enabled": False,
-                },
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                }
-            },
+            additional_config={"torchair_graph_config": {
+                "enabled": False,
+            }},
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)

--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/multicard/test_prefix_caching.py
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )
-
-
-@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [50])
-def test_prefix_cache_with_ascend_scheduler(model: str,
-                                            max_tokens: int) -> None:
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
-
-    with VllmRunner(model,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_prefix_caching': True,
-                        },
-                    },
-                    enforce_eager=False,
-                    max_model_len=2048,
-                    tensor_parallel_size=2,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        prefix_cache_output = vllm_model.generate_greedy(
-            INPUT_PROMPTS, max_tokens)
-
-    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
-    # Disable it now. Fix it or drop the ascend scheduler in the future.
-    # with VllmRunner(model,
-    #                 additional_config={
-    #                     'ascend_scheduler_config': {
-    #                         'enabled': True,
-    #                         'enable_prefix_caching': True,
-    #                         "enable_chunked_prefill": True,
-    #                     },
-    #                 },
-    #                 enforce_eager=True,
-    #                 max_model_len=2048,
-    #                 tensor_parallel_size=2,
-    #                 gpu_memory_utilization=0.7) as vllm_model:
-    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
-    #         INPUT_PROMPTS, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=prefix_cache_output,
-        name_0="vllm_output",
-        name_1="prefix_cache_output",
-    )
-
-    # check_outputs_equal(
-    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
-    #     outputs_1_lst=prefix_cache_output,
-    #     name_0="chunk_prefill_prefix_cache_output",
-    #     name_1="prefix_cache_output",
-    # )
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
                    gpu_memory_utilization=0.8,
                    distributed_executor_backend="mp",
                    enforce_eager=True,
-                    additional_config={
-                        "ascend_scheduler_config": {
-                            "enabled": True,
-                            "enable_chunked_prefill": False
-                        }
-                    },
                    speculative_config={
                        "method": "qwen3_next_mtp",
                        "num_speculative_tokens": 1
--- a/tests/e2e/multicard/test_torchair_graph_mode.py
+++ b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
    kwargs = {}
    if not use_v1_schduler:
        kwargs = {
-            "ascend_scheduler_config": {
-                "enabled": True,
-            },
            "refresh": True,
        }
    additional_config.update(**kwargs)
@@ -121,9 +118,6 @@ def _pangu_torchair_test_fixture(

    # torchair is only work without chunked-prefill now
    kwargs = {
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
        "refresh": True,
    }
    additional_config.update(**kwargs)
@@ -186,9 +180,6 @@ def _qwen_torchair_test_fixture(
        "torchair_graph_config": {
            "enabled": False,
        },
-        "ascend_scheduler_config": {
-            "enabled": True,
-        },
        "refresh": True,
    }

@@ -245,9 +236,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
    kwargs = {}
    if not use_v1_schduler:
        kwargs = {
-            "ascend_scheduler_config": {
-                "enable": True,
-            },
            "refresh": True,
        }
    additional_config.update(**kwargs)
--- a/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
        "VLLM_RPC_TIMEOUT": "3600000",
        "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
    }
-    additional_config: dict[str, Any] = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-    }
+    additional_config: dict[str, Any] = {}
    speculative_config = {
        "num_speculative_tokens": 2,
        "method": "deepseek_mtp"
--- a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
    }
    additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_moe": False,
--- a/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
@@ -68,12 +68,7 @@ aisbench_cases75 = [{
 async def test_models(model: str) -> None:
    port = get_open_port()
    env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
-    additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-        "enable_weight_nz_layout": True
-    }
+    additional_config = {"enable_weight_nz_layout": True}
    server_args = [
        "--quantization", "ascend", "--reasoning-parser", "qwen3",
        "--tensor-parallel-size", "4", "--port",
--- a/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
+++ b/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
        "0.9", "--block-size", "128", "--max-num-seqs", "256",
        "--enforce-eager", "--max-model-len", "35840",
        "--max-num-batched-tokens", "35840", "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
-        "--compilation-config",
+        '{"enable_weight_nz_layout":true}', "--compilation-config",
        '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
    ]
    with RemoteOpenAIServer(model,
--- a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
@@ -33,7 +33,6 @@ MODES = [
    "single",
    "aclgraph",
    "aclgraph_mlapo",
-    "no_chunkprefill",
 ]

 prompts = [
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
        "method": "deepseek_mtp"
    }
    additional_config = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_moe": False,
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
    if mode == "aclgraph_mlapo":
        env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
        additional_config["torchair_graph_config"] = {"enabled": False}
-    if mode == "no_chunkprefill":
-        additional_config["ascend_scheduler_config"] = {"enabled": True}
-        i = server_args.index("--max-num-batched-tokens") + 1
-        server_args[i] = "36864"
    server_args.extend(["--additional-config", json.dumps(additional_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
@@ -134,7 +126,7 @@ async def test_models(model: str, mode: str) -> None:
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
-        if mode in ["single", "no_chunkprefill"]:
+        if mode in ["single"]:
            return
        # aisbench test
        run_aisbench_cases(model,
--- a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
@@ -71,9 +71,6 @@ async def test_models(model: str) -> None:
        "cudagraph_mode": "FULL_DECODE_ONLY"
    }
    additional_config: dict[str, Any] = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
        "torchair_graph_config": {
            "enabled": True
        },
--- a/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
+++ b/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
@@ -92,8 +92,7 @@ async def test_models(model: str, tp_size: int, dp_size: int,
        "--gpu-memory-utilization",
        "0.9",
        "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":true},'
-        '"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
+        '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
    ]
    if full_graph:
        server_args += [
--- a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
+++ b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
@@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None:
        str(tp_size), "--port",
        str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
        "40000", "--max-num-seqs", "400", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.8", "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":false}}',
-        "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
+        "--gpu-memory-utilization", "0.8", "--compilation_config",
+        '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
--- a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -60,11 +60,7 @@ async def test_models(model: str) -> None:
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
    }
-    additional_config: dict[str, Any] = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-    }
+    additional_config: dict[str, Any] = {}
    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
    server_args = [
        "--quantization", "ascend", "--async-scheduling",
--- a/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py
+++ b/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py
@@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None:
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
    }
-    additional_config: dict[str, Any] = {
-        "ascend_scheduler_config": {
-            "enabled": False
-        },
-    }
    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
    server_args = [
        "--quantization", "ascend", "--async-scheduling",
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:
    server_args.extend(
        ["--compilation-config",
         json.dumps(compilation_config)])
-    server_args.extend(["--additional-config", json.dumps(additional_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
--- a/tests/e2e/nightly/models/test_qwq_32b.py
+++ b/tests/e2e/nightly/models/test_qwq_32b.py
@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
        server_args.remove(
            '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
        )
-        server_args.append("--additional-config")
-        server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
        server_args.append("--enforce-eager")
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
@@ -30,7 +30,7 @@ deployment:
      --quantization ascend
      --gpu-memory-utilization 0.9
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

  -
    server_cmd: >
@@ -51,7 +51,7 @@ deployment:
      --quantization ascend
      --gpu-memory-utilization 0.9
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
  acc:
    case_type: accuracy
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
@@ -31,7 +31,7 @@ deployment:
      --gpu-memory-utilization 0.9
      --enforce-eager
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

  -
    server_cmd: >
@@ -53,5 +53,5 @@ deployment:
      --gpu-memory-utilization 0.9
      --enforce-eager
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
@@ -50,7 +50,7 @@ deployment:
          "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
          }'
          --additional-config
-          '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+          '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'

  -
    server_cmd: >
@@ -80,7 +80,7 @@ deployment:
          "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
          }'
          --additional-config
-          '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+          '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -111,7 +111,7 @@ deployment:
        "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
        }'
        --additional-config
-        '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+        '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -141,7 +141,7 @@ deployment:
        "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
        }'
        --additional-config
-        '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+        '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
 benchmarks:
  perf:
    case_type: performance
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
@@ -49,7 +49,7 @@ deployment:
          "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
          }'
          --additional-config
-          '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+          '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'

  -
    server_cmd: >
@@ -79,7 +79,7 @@ deployment:
          "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
          }'
          --additional-config
-          '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+          '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -110,7 +110,7 @@ deployment:
        "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
        }'
        --additional-config
-        '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
+        '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -140,7 +140,7 @@ deployment:
        "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
        }'
        --additional-config
-        '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
+        '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
 benchmarks:
  perf:
    case_type: performance
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
@@ -29,7 +29,7 @@ deployment:
        --trust-remote-code 
        --no-enable-prefix-caching 
        --gpu-memory-utilization 0.9 
-        --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+        --additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'

  -
    server_cmd: >
@@ -49,5 +49,5 @@ deployment:
        --trust-remote-code 
        --no-enable-prefix-caching 
        --gpu-memory-utilization 0.92 
-        --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+        --additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 benchmarks:
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
    if graph_mode == CUDAGraphMode.FULL:
        graph_mode_str = "FULL_DECODE_ONLY"

-    with VllmRunner(
-            model_name,
-            tensor_parallel_size=1,
-            max_num_seqs=256,
-            gpu_memory_utilization=0.7,
-            distributed_executor_backend="mp",
-            enable_expert_parallel=True,
-            speculative_config={
-                "method": "deepseek_mtp",
-                "num_speculative_tokens": num_speculative_tokens,
-                "disable_padded_drafter_batch": disable_padded_drafter_batch,
-            },
-            enforce_eager=enforce_eager,
-            max_model_len=2000,
-            compilation_config=CompilationConfig(
-                cudagraph_mode=graph_mode_str,
-                cudagraph_capture_sizes=[12],
-            ),
-            additional_config={"ascend_scheduler_config": {
-                "enabled": False
-            }}) as spec_llm:
+    with VllmRunner(model_name,
+                    tensor_parallel_size=1,
+                    max_num_seqs=256,
+                    gpu_memory_utilization=0.7,
+                    distributed_executor_backend="mp",
+                    enable_expert_parallel=True,
+                    speculative_config={
+                        "method":
+                        "deepseek_mtp",
+                        "num_speculative_tokens":
+                        num_speculative_tokens,
+                        "disable_padded_drafter_batch":
+                        disable_padded_drafter_batch,
+                    },
+                    enforce_eager=enforce_eager,
+                    max_model_len=2000,
+                    compilation_config=CompilationConfig(
+                        cudagraph_mode=graph_mode_str,
+                        cudagraph_capture_sizes=[12],
+                    )) as spec_llm:
        spec_outputs = spec_llm.generate(example_prompts, sampling_config)

    matches = 0
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -12,11 +12,6 @@ MODEL = "Qwen/Qwen3-0.6B"
@pytest.mark.parametrize("enforce_eager", [True, False])
 def test_concurrent_partial_prefill(enforce_eager):
    with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=enforce_eager,
@@ -31,11 +26,6 @@ def test_concurrent_partial_prefill(enforce_eager):
@pytest.mark.parametrize("enforce_eager", [True, False])
 def test_prefix_cache_stats_is_recorded(enforce_eager):
    with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=enforce_eager,
@@ -47,48 +37,6 @@ def test_prefix_cache_stats_is_recorded(enforce_eager):
        assert outputs[0].num_cached_tokens == 128


-@pytest.mark.parametrize("max_tokens",
-                         [4])  # cannot align results when max_tokens > 4
-@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
-def test_chunked_prefill_with_ascend_scheduler(
-        max_tokens: int, chunked_prefill_token_size: int) -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
-    ]
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                            'enable_chunked_prefill': True,
-                        },
-                    },
-                    max_num_seqs=max_num_seqs,
-                    max_num_batched_tokens=max_num_batched_tokens,
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        chunked_prefill_output = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=chunked_prefill_output,
-        name_0="vllm_output",
-        name_1="chunked_prefill_output",
-    )
-
-
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
--- a/tests/e2e/singlecard/test_chunked.py
+++ b/tests/e2e/singlecard/test_chunked.py
@@ -1,82 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""
-Compare the outputs of vLLM with and without aclgraph.
-
-Run `pytest tests/compile/test_aclgraph.py`.
-"""
-import gc
-
-import pytest
-import torch
-from vllm import SamplingParams
-
-from tests.e2e.conftest import VllmRunner
-
-MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("max_tokens", [1])
-def test_models(
-    model: str,
-    max_tokens: int,
-) -> None:
-    prompts = ["The president of the United States is"]
-
-    sampling_params = SamplingParams(
-        max_tokens=max_tokens,
-        temperature=0.0,
-    )
-
-    with VllmRunner(model,
-                    long_prefill_token_threshold=20,
-                    enforce_eager=False) as vllm_model:
-        output1 = vllm_model.generate(prompts, sampling_params)
-
-    with VllmRunner(model,
-                    enforce_eager=False,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True
-                        },
-                    }) as vllm_model:
-        output2 = vllm_model.generate(prompts, sampling_params)
-
-    # Extract the generated token IDs for comparison
-    token_ids1 = output1[0][0][0]
-    token_ids2 = output2[0][0][0]
-
-    print(f"Token IDs 1: {token_ids1}")
-    print(f"Token IDs 2: {token_ids2}")
-
-    # Convert token IDs to tensors and calculate cosine similarity
-    # Take the length of a shorter sequence to ensure consistent dimensions
-    min_len = min(len(token_ids1), len(token_ids2))
-
-    tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
-    tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
-
-    # Calculate similarity using torch.cosine_similarity
-    similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
-    print(f"Token IDs cosine similarity: {similarity.item()}")
-
-    assert similarity > 0.95
-
-    gc.collect()
-    torch.npu.empty_cache()
-    torch.npu.reset_peak_memory_stats()
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -20,7 +20,6 @@

 Run `pytest tests/test_offline_inference.py`.
 """
-import pytest
 from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
@@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template):
            assert output_str, "Generated output should not be empty."


-@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
-                  "Add this back after fixing the issue.")
-def test_multimodal_ascend_scheduler(prompt_template):
-    image = ImageAsset("cherry_blossom") \
-        .pil_image.convert("RGB")
-    img_questions = [
-        "What is the content of this image?",
-        "Describe the content of this image in detail.",
-        "What's in the image?",
-        "Where is this image taken?",
-    ]
-    images = [image] * len(img_questions)
-    prompts = prompt_template(img_questions)
-    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
-                    max_model_len=4096,
-                    additional_config={
-                        'ascend_scheduler_config': {
-                            'enabled': True,
-                        },
-                    },
-                    mm_processor_kwargs={
-                        "min_pixels": 28 * 28,
-                        "max_pixels": 1280 * 28 * 28,
-                        "fps": 1,
-                    },
-                    enforce_eager=True) as vllm_model:
-        outputs = vllm_model.generate_greedy(prompts=prompts,
-                                             images=images,
-                                             max_tokens=64)
-        assert len(outputs) == len(prompts)
-        for _, output_str in outputs:
-            assert output_str, "Generated output should not be empty."
-
-
 def test_multimodal_audio():
    audio_prompt = "".join([
        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"