[CI] drop ascend scheduler test (#4582)

Let's drop the ascend scheduler tests first to ensure all functions work without it. - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-01 20:33:50 +08:00
parent 203b4e6777
commit 27b09ca9b9
28 changed files with 53 additions and 376 deletions
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -94,7 +94,6 @@ jobs:
          pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
          pytest -sv tests/e2e/singlecard/test_bge_model.py
          pytest -sv tests/e2e/singlecard/test_camem.py
          pytest -sv tests/e2e/singlecard/test_chunked.py
          pytest -sv tests/e2e/singlecard/test_embedding.py
          # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
--- a/tests/e2e/310p/test_offline_inference_parallel_310p.py
+++ b/tests/e2e/310p/test_offline_inference_parallel_310p.py
@@ -29,9 +29,6 @@ ADDITIONAL_CONFIG = [{
    "additional_config": {
        "torchair_graph_config": {
            "enabled": True
        },
        "ascend_scheduler_config": {
            "enabled": True,
        }
    }
 }]
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
    max_tokens = 5
    # FIXME: Really strange that chunked prefill might lead to different results, investigate further
-    with VllmRunner(
+    with VllmRunner(model_name, tensor_parallel_size=2,
-            model_name,
+                    enforce_eager=False) as vllm_model:
            tensor_parallel_size=2,
            additional_config={"ascend_scheduler_config": {
                "enabled": True
            }},
            enforce_eager=False) as vllm_model:
        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with VllmRunner(
+    with VllmRunner(model_name,
-            model_name,
+                    tensor_parallel_size=2,
-            tensor_parallel_size=2,
+                    enable_expert_parallel=True,
-            enable_expert_parallel=True,
+                    enforce_eager=False) as vllm_model:
            additional_config={"ascend_scheduler_config": {
                "enabled": True
            }},
            enforce_eager=False) as vllm_model:
        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
    check_outputs_equal(
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
                    tensor_parallel_size=2,
                    max_model_len=1024,
                    dtype="auto",
-                    enable_expert_parallel=True,
+                    enable_expert_parallel=True) as vllm_model:
                    additional_config={
                        "ascend_scheduler_config": {
                            "enabled": True,
                            "chunked_prefill_enabled": False,
                        },
                    }) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
                    tensor_parallel_size=2,
                    max_model_len=1024,
                    dtype="auto",
-                    enable_expert_parallel=True,
+                    enable_expert_parallel=True) as vllm_model:
                    additional_config={
                        "ascend_scheduler_config": {
                            "enabled": True,
                            "chunked_prefill_enabled": False,
                        },
                    }) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
                    "enabled": True,
                },
                "enable_multistream_moe": True,
                "ascend_scheduler_config": {
                    "enabled": True,
                },
                "refresh": True,
            },
    ) as vllm_model:
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
            quantization="ascend",
            enforce_eager=True,
            enable_expert_parallel=True,
-            additional_config={
+            additional_config={"torchair_graph_config": {
-                "torchair_graph_config": {
+                "enabled": False,
-                    "enabled": False,
+            }},
                },
                "ascend_scheduler_config": {
                    "enabled": True,
                }
            },
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)
--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/multicard/test_prefix_caching.py
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )
@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_ascend_scheduler(model: str,
                                            max_tokens: int) -> None:
    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    enforce_eager=False,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_prefix_caching': True,
                        },
                    },
                    enforce_eager=False,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        prefix_cache_output = vllm_model.generate_greedy(
            INPUT_PROMPTS, max_tokens)
    # TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
    # Disable it now. Fix it or drop the ascend scheduler in the future.
    # with VllmRunner(model,
    #                 additional_config={
    #                     'ascend_scheduler_config': {
    #                         'enabled': True,
    #                         'enable_prefix_caching': True,
    #                         "enable_chunked_prefill": True,
    #                     },
    #                 },
    #                 enforce_eager=True,
    #                 max_model_len=2048,
    #                 tensor_parallel_size=2,
    #                 gpu_memory_utilization=0.7) as vllm_model:
    #     chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
    #         INPUT_PROMPTS, max_tokens)
    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=prefix_cache_output,
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )
    # check_outputs_equal(
    #     outputs_0_lst=chunk_prefill_prefix_cache_output,
    #     outputs_1_lst=prefix_cache_output,
    #     name_0="chunk_prefill_prefix_cache_output",
    #     name_1="prefix_cache_output",
    # )
--- a/tests/e2e/multicard/test_qwen3_next.py
+++ b/tests/e2e/multicard/test_qwen3_next.py
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
                    gpu_memory_utilization=0.8,
                    distributed_executor_backend="mp",
                    enforce_eager=True,
                    additional_config={
                        "ascend_scheduler_config": {
                            "enabled": True,
                            "enable_chunked_prefill": False
                        }
                    },
                    speculative_config={
                        "method": "qwen3_next_mtp",
                        "num_speculative_tokens": 1
--- a/tests/e2e/multicard/test_torchair_graph_mode.py
+++ b/tests/e2e/multicard/test_torchair_graph_mode.py
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
    kwargs = {}
    if not use_v1_schduler:
        kwargs = {
            "ascend_scheduler_config": {
                "enabled": True,
            },
            "refresh": True,
        }
    additional_config.update(**kwargs)
@@ -121,9 +118,6 @@ def _pangu_torchair_test_fixture(
    # torchair is only work without chunked-prefill now
    kwargs = {
        "ascend_scheduler_config": {
            "enabled": True,
        },
        "refresh": True,
    }
    additional_config.update(**kwargs)
@@ -186,9 +180,6 @@ def _qwen_torchair_test_fixture(
        "torchair_graph_config": {
            "enabled": False,
        },
        "ascend_scheduler_config": {
            "enabled": True,
        },
        "refresh": True,
    }
@@ -245,9 +236,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
    kwargs = {}
    if not use_v1_schduler:
        kwargs = {
            "ascend_scheduler_config": {
                "enable": True,
            },
            "refresh": True,
        }
    additional_config.update(**kwargs)
--- a/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_mtpx_deepseek_r1_0528_w8a8.py
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
        "VLLM_RPC_TIMEOUT": "3600000",
        "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
    }
-    additional_config: dict[str, Any] = {
+    additional_config: dict[str, Any] = {}
        "ascend_scheduler_config": {
            "enabled": False
        },
    }
    speculative_config = {
        "num_speculative_tokens": 2,
        "method": "deepseek_mtp"
--- a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
    }
    additional_config = {
        "ascend_scheduler_config": {
            "enabled": False
        },
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_moe": False,
--- a/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
@@ -68,12 +68,7 @@ aisbench_cases75 = [{
 async def test_models(model: str) -> None:
    port = get_open_port()
    env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
-    additional_config = {
+    additional_config = {"enable_weight_nz_layout": True}
        "ascend_scheduler_config": {
            "enabled": False
        },
        "enable_weight_nz_layout": True
    }
    server_args = [
        "--quantization", "ascend", "--reasoning-parser", "qwen3",
        "--tensor-parallel-size", "4", "--port",
--- a/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
+++ b/tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
        "0.9", "--block-size", "128", "--max-num-seqs", "256",
        "--enforce-eager", "--max-model-len", "35840",
        "--max-num-batched-tokens", "35840", "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
+        '{"enable_weight_nz_layout":true}', "--compilation-config",
        "--compilation-config",
        '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
    ]
    with RemoteOpenAIServer(model,
--- a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
@@ -33,7 +33,6 @@ MODES = [
    "single",
    "aclgraph",
    "aclgraph_mlapo",
    "no_chunkprefill",
 ]
 prompts = [
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
        "method": "deepseek_mtp"
    }
    additional_config = {
        "ascend_scheduler_config": {
            "enabled": False
        },
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_moe": False,
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
    if mode == "aclgraph_mlapo":
        env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
        additional_config["torchair_graph_config"] = {"enabled": False}
    if mode == "no_chunkprefill":
        additional_config["ascend_scheduler_config"] = {"enabled": True}
        i = server_args.index("--max-num-batched-tokens") + 1
        server_args[i] = "36864"
    server_args.extend(["--additional-config", json.dumps(additional_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
@@ -134,7 +126,7 @@ async def test_models(model: str, mode: str) -> None:
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        print(choices)
-        if mode in ["single", "no_chunkprefill"]:
+        if mode in ["single"]:
            return
        # aisbench test
        run_aisbench_cases(model,
--- a/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
@@ -71,9 +71,6 @@ async def test_models(model: str) -> None:
        "cudagraph_mode": "FULL_DECODE_ONLY"
    }
    additional_config: dict[str, Any] = {
        "ascend_scheduler_config": {
            "enabled": False
        },
        "torchair_graph_config": {
            "enabled": True
        },
--- a/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
+++ b/tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
@@ -92,8 +92,7 @@ async def test_models(model: str, tp_size: int, dp_size: int,
        "--gpu-memory-utilization",
        "0.9",
        "--additional-config",
-        '{"ascend_scheduler_config":{"enabled":true},'
+        '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
        '"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
    ]
    if full_graph:
        server_args += [
--- a/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
+++ b/tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
@@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None:
        str(tp_size), "--port",
        str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
        "40000", "--max-num-seqs", "400", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.8", "--additional-config",
+        "--gpu-memory-utilization", "0.8", "--compilation_config",
-        '{"ascend_scheduler_config":{"enabled":false}}',
+        '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
        "--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
--- a/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
+++ b/tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
@@ -60,11 +60,7 @@ async def test_models(model: str) -> None:
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
    }
-    additional_config: dict[str, Any] = {
+    additional_config: dict[str, Any] = {}
        "ascend_scheduler_config": {
            "enabled": False
        },
    }
    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
    server_args = [
        "--quantization", "ascend", "--async-scheduling",
--- a/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py
+++ b/tests/e2e/nightly/models/test_qwen3_235b_w8a8.py
@@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None:
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
    }
    additional_config: dict[str, Any] = {
        "ascend_scheduler_config": {
            "enabled": False
        },
    }
    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
    server_args = [
        "--quantization", "ascend", "--async-scheduling",
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:
    server_args.extend(
        ["--compilation-config",
         json.dumps(compilation_config)])
    server_args.extend(["--additional-config", json.dumps(additional_config)])
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
--- a/tests/e2e/nightly/models/test_qwq_32b.py
+++ b/tests/e2e/nightly/models/test_qwq_32b.py
@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
        server_args.remove(
            '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
        )
        server_args.append("--additional-config")
        server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
        server_args.append("--enforce-eager")
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
@@ -30,7 +30,7 @@ deployment:
      --quantization ascend
      --gpu-memory-utilization 0.9
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
  -
    server_cmd: >
@@ -51,7 +51,7 @@ deployment:
      --quantization ascend
      --gpu-memory-utilization 0.9
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
  acc:
    case_type: accuracy
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
@@ -31,7 +31,7 @@ deployment:
      --gpu-memory-utilization 0.9
      --enforce-eager
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
  -
    server_cmd: >
@@ -53,5 +53,5 @@ deployment:
      --gpu-memory-utilization 0.9
      --enforce-eager
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
-      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
 benchmarks:
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
@@ -50,7 +50,7 @@ deployment:
          "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
          }'
          --additional-config
-          '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+          '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
  -
    server_cmd: >
@@ -80,7 +80,7 @@ deployment:
          "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
          }'
          --additional-config
-          '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+          '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -111,7 +111,7 @@ deployment:
        "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
        }'
        --additional-config
-        '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+        '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -141,7 +141,7 @@ deployment:
        "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
        }'
        --additional-config
-        '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+        '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
 benchmarks:
  perf:
    case_type: performance
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
@@ -49,7 +49,7 @@ deployment:
          "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
          }'
          --additional-config
-          '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+          '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
  -
    server_cmd: >
@@ -79,7 +79,7 @@ deployment:
          "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
          }'
          --additional-config
-          '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+          '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -110,7 +110,7 @@ deployment:
        "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
        }'
        --additional-config
-        '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
+        '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -140,7 +140,7 @@ deployment:
        "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
        }'
        --additional-config
-        '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
+        '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
 benchmarks:
  perf:
    case_type: performance
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml
@@ -29,7 +29,7 @@ deployment:
        --trust-remote-code 
        --no-enable-prefix-caching 
        --gpu-memory-utilization 0.9 
-        --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+        --additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
  -
    server_cmd: >
@@ -49,5 +49,5 @@ deployment:
        --trust-remote-code 
        --no-enable-prefix-caching 
        --gpu-memory-utilization 0.92 
-        --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
+        --additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
 benchmarks:
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
    if graph_mode == CUDAGraphMode.FULL:
        graph_mode_str = "FULL_DECODE_ONLY"
-    with VllmRunner(
+    with VllmRunner(model_name,
-            model_name,
+                    tensor_parallel_size=1,
-            tensor_parallel_size=1,
+                    max_num_seqs=256,
-            max_num_seqs=256,
+                    gpu_memory_utilization=0.7,
-            gpu_memory_utilization=0.7,
+                    distributed_executor_backend="mp",
-            distributed_executor_backend="mp",
+                    enable_expert_parallel=True,
-            enable_expert_parallel=True,
+                    speculative_config={
-            speculative_config={
+                        "method":
-                "method": "deepseek_mtp",
+                        "deepseek_mtp",
-                "num_speculative_tokens": num_speculative_tokens,
+                        "num_speculative_tokens":
-                "disable_padded_drafter_batch": disable_padded_drafter_batch,
+                        num_speculative_tokens,
-            },
+                        "disable_padded_drafter_batch":
-            enforce_eager=enforce_eager,
+                        disable_padded_drafter_batch,
-            max_model_len=2000,
+                    },
-            compilation_config=CompilationConfig(
+                    enforce_eager=enforce_eager,
-                cudagraph_mode=graph_mode_str,
+                    max_model_len=2000,
-                cudagraph_capture_sizes=[12],
+                    compilation_config=CompilationConfig(
-            ),
+                        cudagraph_mode=graph_mode_str,
-            additional_config={"ascend_scheduler_config": {
+                        cudagraph_capture_sizes=[12],
-                "enabled": False
+                    )) as spec_llm:
            }}) as spec_llm:
        spec_outputs = spec_llm.generate(example_prompts, sampling_config)
    matches = 0
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -12,11 +12,6 @@ MODEL = "Qwen/Qwen3-0.6B"
@pytest.mark.parametrize("enforce_eager", [True, False])
 def test_concurrent_partial_prefill(enforce_eager):
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=enforce_eager,
@@ -31,11 +26,6 @@ def test_concurrent_partial_prefill(enforce_eager):
@pytest.mark.parametrize("enforce_eager", [True, False])
 def test_prefix_cache_stats_is_recorded(enforce_eager):
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=8192,
                    enforce_eager=enforce_eager,
@@ -47,48 +37,6 @@ def test_prefix_cache_stats_is_recorded(enforce_eager):
        assert outputs[0].num_cached_tokens == 128
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
 def test_chunked_prefill_with_ascend_scheduler(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_chunked_prefill': True,
                        },
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        chunked_prefill_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
--- a/tests/e2e/singlecard/test_chunked.py
+++ b/tests/e2e/singlecard/test_chunked.py
@@ -1,82 +0,0 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 """
 Compare the outputs of vLLM with `long_prefill_token_threshold` set against
 the outputs with the Ascend scheduler enabled.
 Run `pytest tests/e2e/singlecard/test_chunked.py`.
 """
 import gc
 import pytest
 import torch
 from vllm import SamplingParams
 from tests.e2e.conftest import VllmRunner
 MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
    model: str,
    max_tokens: int,
) -> None:
    """Compare token IDs from two differently configured engine runs.

    The first run sets ``long_prefill_token_threshold=20``; the second enables
    the Ascend scheduler through ``additional_config``. Both use
    ``enforce_eager=False`` and temperature 0.0. The token-ID sequences are
    truncated to a common length, cast to float tensors, and required to have
    a cosine similarity above 0.95.

    Args:
        model: Model name passed to ``VllmRunner``.
        max_tokens: Maximum number of tokens to generate.
    """
    prompts = ["The president of the United States is"]
    sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
    scheduler_config = {
        'ascend_scheduler_config': {
            'enabled': True
        },
    }

    # Run 1: long-prefill threshold set, no scheduler override.
    with VllmRunner(model,
                    long_prefill_token_threshold=20,
                    enforce_eager=False) as threshold_runner:
        output1 = threshold_runner.generate(prompts, sampling_params)

    # Run 2: Ascend scheduler enabled.
    with VllmRunner(model,
                    enforce_eager=False,
                    additional_config=scheduler_config) as scheduler_runner:
        output2 = scheduler_runner.generate(prompts, sampling_params)

    # Presumably [0][0][0] selects the token-ID list of the first sample for
    # the first prompt -- confirm against VllmRunner.generate.
    token_ids1 = output1[0][0][0]
    token_ids2 = output2[0][0][0]
    print(f"Token IDs 1: {token_ids1}")
    print(f"Token IDs 2: {token_ids2}")

    # Truncate both sequences to the shorter length so the tensors have
    # matching dimensions before computing the similarity.
    common_len = min(len(token_ids1), len(token_ids2))
    vec1, vec2 = (torch.tensor(ids[:common_len], dtype=torch.float32)
                  for ids in (token_ids1, token_ids2))

    similarity = torch.cosine_similarity(vec1, vec2, dim=0)
    print(f"Token IDs cosine similarity: {similarity.item()}")
    assert similarity > 0.95

    # Release Python garbage and NPU memory once the comparison is done.
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -20,7 +20,6 @@
 Run `pytest tests/test_offline_inference.py`.
 """
 import pytest
 from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
@@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template):
            assert output_str, "Generated output should not be empty."
@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
                  "Add this back after fixing the issue.")
def test_multimodal_ascend_scheduler(prompt_template):
    """Generate answers to four image questions with the Ascend scheduler on.

    Currently skipped; see the skip reason on the decorator. The same
    ``cherry_blossom`` image (converted to RGB) is paired with every question,
    and the test only checks that one non-empty output is produced per prompt.

    Args:
        prompt_template: Callable applied to the list of questions to build
            the prompts (presumably a pytest fixture -- confirm).
    """
    cherry_blossom = ImageAsset("cherry_blossom").pil_image.convert("RGB")
    questions = [
        "What is the content of this image?",
        "Describe the content of this image in detail.",
        "What's in the image?",
        "Where is this image taken?",
    ]
    # One reference to the same image per question.
    images = [cherry_blossom for _ in questions]
    prompts = prompt_template(questions)

    runner_kwargs = {
        "max_model_len": 4096,
        "additional_config": {
            'ascend_scheduler_config': {
                'enabled': True,
            },
        },
        "mm_processor_kwargs": {
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        "enforce_eager": True,
    }

    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
                    **runner_kwargs) as vllm_model:
        outputs = vllm_model.generate_greedy(prompts=prompts,
                                             images=images,
                                             max_tokens=64)

        # Every prompt must yield exactly one, non-empty, generated string.
        assert len(outputs) == len(prompts)
        for _, generated_text in outputs:
            assert generated_text, "Generated output should not be empty."
 def test_multimodal_audio():
    audio_prompt = "".join([
        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"