Revert "drop ascend scheduler" (#4580)

Reverts vllm-project/vllm-ascend#4498
- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2
Mengqing Cao
2025-11-29 22:20:48 +08:00
committed by GitHub
parent 4dbe4fd123
commit 517fd9272d
52 changed files with 2948 additions and 85 deletions
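
For context, this revert restores the ascend_scheduler_config knob that the tests and deployment configs below pass through additional_config. A minimal sketch of the restored usage, assuming the VllmRunner helper from tests/e2e/conftest.py and a small model that appears elsewhere in these tests:

    from tests.e2e.conftest import VllmRunner

    # Enable the AscendScheduler; chunked prefill stays disabled unless
    # "enable_chunked_prefill": True is set alongside it.
    with VllmRunner(
            "Qwen/Qwen3-0.6B",
            additional_config={"ascend_scheduler_config": {"enabled": True}},
    ) as vllm_model:
        outputs = vllm_model.generate_greedy(["Hello, my name is"], 4)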

View File

@@ -24,12 +24,15 @@ from tests.e2e.conftest import VllmRunner
MODELS = [
"IntervitensInc/pangu-pro-moe-model",
]
# set additional config for torchair graph
# set additional config for ascend scheduler and torchair graph
ADDITIONAL_CONFIG = [{
"additional_config": {
"torchair_graph_config": {
"enabled": True
},
"ascend_scheduler_config": {
"enabled": True,
}
}
}]

View File

@@ -15,14 +15,23 @@ def test_e2e_ep_correctness(model_name):
max_tokens = 5
# FIXME: strangely, chunked prefill can lead to different results; investigate further
with VllmRunner(model_name, tensor_parallel_size=2,
enforce_eager=False) as vllm_model:
with VllmRunner(
model_name,
tensor_parallel_size=2,
additional_config={"ascend_scheduler_config": {
"enabled": True
}},
enforce_eager=False) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model_name,
tensor_parallel_size=2,
enable_expert_parallel=True,
enforce_eager=False) as vllm_model:
with VllmRunner(
model_name,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={"ascend_scheduler_config": {
"enabled": True
}},
enforce_eager=False) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(

View File

@@ -49,7 +49,13 @@ def test_generate_with_allgather():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
enable_expert_parallel=True) as vllm_model:
enable_expert_parallel=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"chunked_prefill_enabled": False,
},
}) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -70,5 +76,11 @@ def test_generate_with_alltoall():
tensor_parallel_size=2,
max_model_len=1024,
dtype="auto",
enable_expert_parallel=True) as vllm_model:
enable_expert_parallel=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"chunked_prefill_enabled": False,
},
}) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)

View File

@@ -82,6 +82,9 @@ def test_models_distributed_DeepSeek_multistream_moe():
"enabled": True,
},
"enable_multistream_moe": True,
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
},
) as vllm_model:
@@ -151,9 +154,14 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
quantization="ascend",
enforce_eager=True,
enable_expert_parallel=True,
additional_config={"torchair_graph_config": {
"enabled": False,
}},
additional_config={
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
}
},
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)

View File

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching on V1 scheduler."""
"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
import pytest
@@ -84,3 +84,67 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
name_0="vllm_output",
name_1="prefix_cache_output",
)
@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_ascend_scheduler(model: str,
max_tokens: int) -> None:
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
enforce_eager=False,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_prefix_caching': True,
},
},
enforce_eager=False,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
# TODO: enabling APC together with chunked prefill on the ascend scheduler leads to an accuracy problem.
# Disable it for now. Fix it or drop the ascend scheduler in the future.
# with VllmRunner(model,
# additional_config={
# 'ascend_scheduler_config': {
# 'enabled': True,
# 'enable_prefix_caching': True,
# "enable_chunked_prefill": True,
# },
# },
# enforce_eager=True,
# max_model_len=2048,
# tensor_parallel_size=2,
# gpu_memory_utilization=0.7) as vllm_model:
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
# INPUT_PROMPTS, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=prefix_cache_output,
name_0="vllm_output",
name_1="prefix_cache_output",
)
# check_outputs_equal(
# outputs_0_lst=chunk_prefill_prefix_cache_output,
# outputs_1_lst=prefix_cache_output,
# name_0="chunk_prefill_prefix_cache_output",
# name_1="prefix_cache_output",
# )

View File

@@ -24,7 +24,6 @@ Run `pytest tests/e2e/multicard/test_qwen3_next.py`.
import os
from unittest.mock import patch
import pytest
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import VllmRunner
@@ -64,8 +63,6 @@ def test_models_distributed_Qwen3_NEXT_TP4_FULL_DECODE_ONLY():
del vllm_model
@pytest.mark.skip(
reason="Qwen3-Next + MTP doesn't work with chunked prefill. Fix Me")
def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
example_prompts = [
"Hello, my name is",
@@ -92,6 +89,12 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
gpu_memory_utilization=0.8,
distributed_executor_backend="mp",
enforce_eager=True,
additional_config={
"ascend_scheduler_config": {
"enabled": True,
"enable_chunked_prefill": False
}
},
speculative_config={
"method": "qwen3_next_mtp",
"num_speculative_tokens": 1

View File

@@ -44,6 +44,9 @@ def _deepseek_torchair_test_fixture(
kwargs = {}
if not use_v1_schduler:
kwargs = {
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
additional_config.update(**kwargs)
@@ -117,6 +120,9 @@ def _pangu_torchair_test_fixture(
# torchair only works without chunked prefill for now
kwargs = {
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
additional_config.update(**kwargs)
@@ -179,6 +185,9 @@ def _qwen_torchair_test_fixture(
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}
@@ -235,6 +244,9 @@ def _deepseek_v2_lite_torchair_test_fixure(
kwargs = {}
if not use_v1_schduler:
kwargs = {
"ascend_scheduler_config": {
"enable": True,
},
"refresh": True,
}
additional_config.update(**kwargs)

View File

@@ -73,7 +73,11 @@ async def test_models(model: str, mode: str) -> None:
"VLLM_RPC_TIMEOUT": "3600000",
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
}
additional_config: dict[str, Any] = {}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
speculative_config = {
"num_speculative_tokens": 2,
"method": "deepseek_mtp"

View File

@@ -74,6 +74,9 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,

View File

@@ -68,7 +68,12 @@ aisbench_cases75 = [{
async def test_models(model: str) -> None:
port = get_open_port()
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
additional_config = {"enable_weight_nz_layout": True}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"enable_weight_nz_layout": True
}
server_args = [
"--quantization", "ascend", "--reasoning-parser", "qwen3",
"--tensor-parallel-size", "4", "--port",

View File

@@ -83,7 +83,8 @@ async def test_models(model: str, tp_size: int) -> None:
"0.9", "--block-size", "128", "--max-num-seqs", "256",
"--enforce-eager", "--max-model-len", "35840",
"--max-num-batched-tokens", "35840", "--additional-config",
'{"enable_weight_nz_layout":true}', "--compilation-config",
'{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
"--compilation-config",
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
]
with RemoteOpenAIServer(model,

View File

@@ -33,6 +33,7 @@ MODES = [
"single",
"aclgraph",
"aclgraph_mlapo",
"no_chunkprefill",
]
prompts = [
@@ -81,6 +82,9 @@ async def test_models(model: str, mode: str) -> None:
"method": "deepseek_mtp"
}
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": False,
@@ -108,6 +112,10 @@ async def test_models(model: str, mode: str) -> None:
if mode == "aclgraph_mlapo":
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
additional_config["torchair_graph_config"] = {"enabled": False}
if mode == "no_chunkprefill":
additional_config["ascend_scheduler_config"] = {"enabled": True}
i = server_args.index("--max-num-batched-tokens") + 1
server_args[i] = "36864"
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -71,6 +71,9 @@ async def test_models(model: str) -> None:
"cudagraph_mode": "FULL_DECODE_ONLY"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
"torchair_graph_config": {
"enabled": True
},

View File

@@ -92,6 +92,7 @@ async def test_models(model: str, tp_size: int, dp_size: int,
"--gpu-memory-utilization",
"0.9",
"--additional-config",
'{"ascend_scheduler_config":{"enabled":true},'
'"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
]
if full_graph:

View File

@@ -85,8 +85,9 @@ async def test_models(model: str, tp_size: int) -> None:
str(tp_size), "--port",
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
"40000", "--max-num-seqs", "400", "--trust-remote-code",
"--gpu-memory-utilization", "0.8", "--compilation_config",
'{"cudagraph_mode": "FULL_DECODE_ONLY"}'
"--gpu-memory-utilization", "0.8", "--additional-config",
'{"ascend_scheduler_config":{"enabled":false}}',
"--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -60,7 +60,11 @@ async def test_models(model: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
additional_config: dict[str, Any] = {}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",

View File

@@ -63,6 +63,11 @@ async def test_models(model: str, mode: str) -> None:
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
}
additional_config: dict[str, Any] = {
"ascend_scheduler_config": {
"enabled": False
},
}
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
server_args = [
"--quantization", "ascend", "--async-scheduling",
@@ -77,6 +82,7 @@ async def test_models(model: str, mode: str) -> None:
server_args.extend(
["--compilation-config",
json.dumps(compilation_config)])
server_args.extend(["--additional-config", json.dumps(additional_config)])
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}

View File

@@ -93,6 +93,8 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
server_args.remove(
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
)
server_args.append("--additional-config")
server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
server_args.append("--enforce-eager")
request_keyword_args: dict[str, Any] = {
**api_keyword_args,

View File

@@ -30,7 +30,7 @@ deployment:
--quantization ascend
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -51,7 +51,7 @@ deployment:
--quantization ascend
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
benchmarks:
acc:
case_type: accuracy

View File

@@ -31,7 +31,7 @@ deployment:
--gpu-memory-utilization 0.9
--enforce-eager
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -53,5 +53,5 @@ deployment:
--gpu-memory-utilization 0.9
--enforce-eager
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
benchmarks:

View File

@@ -50,7 +50,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
@@ -80,7 +80,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -111,7 +111,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -141,7 +141,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
benchmarks:
perf:
case_type: performance

View File

@@ -49,7 +49,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
@@ -79,7 +79,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -110,7 +110,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -140,7 +140,7 @@ deployment:
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
benchmarks:
perf:
case_type: performance

View File

@@ -29,7 +29,7 @@ deployment:
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
-
server_cmd: >
@@ -49,5 +49,5 @@ deployment:
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.92
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
benchmarks:

View File

@@ -48,26 +48,27 @@ def mtp_correctness(sampling_config: SamplingParams,
if graph_mode == CUDAGraphMode.FULL:
graph_mode_str = "FULL_DECODE_ONLY"
with VllmRunner(model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method":
"deepseek_mtp",
"num_speculative_tokens":
num_speculative_tokens,
"disable_padded_drafter_batch":
disable_padded_drafter_batch,
},
enforce_eager=enforce_eager,
max_model_len=2000,
compilation_config=CompilationConfig(
cudagraph_mode=graph_mode_str,
cudagraph_capture_sizes=[12],
)) as spec_llm:
with VllmRunner(
model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": num_speculative_tokens,
"disable_padded_drafter_batch": disable_padded_drafter_batch,
},
enforce_eager=enforce_eager,
max_model_len=2000,
compilation_config=CompilationConfig(
cudagraph_mode=graph_mode_str,
cudagraph_capture_sizes=[12],
),
additional_config={"ascend_scheduler_config": {
"enabled": False
}}) as spec_llm:
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
matches = 0

View File

@@ -0,0 +1,170 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B"
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_concurrent_partial_prefill(enforce_eager):
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_num_seqs=3,
max_num_batched_tokens=8192,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
3)
assert len(outputs) == 3
for output in outputs:
assert len(output.outputs) == 1
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_prefix_cache_stats_is_recorded(enforce_eager):
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_num_seqs=3,
max_num_batched_tokens=8192,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
# 129 tokens will make sure the first 128 tokens are cached in a block
input_tokens = {"prompt_token_ids": [101] * 129}
_ = vllm_model.model.generate([input_tokens])
outputs = vllm_model.model.generate([input_tokens])
assert outputs[0].num_cached_tokens == 128
@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_ascend_scheduler(
max_tokens: int, chunked_prefill_token_size: int) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_chunked_prefill': True,
},
},
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
chunked_prefill_output = vllm_model.generate_greedy(
example_prompts, max_tokens)
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=chunked_prefill_output,
name_0="vllm_output",
name_1="chunked_prefill_output",
)
@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_scheduler_dynamic_batch(
max_tokens: int, chunked_prefill_token_size: int) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with VllmRunner(MODEL,
additional_config={
'SLO_limits_for_dynamic_batch': 0,
},
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
dynamic_batch_output = vllm_model.generate_greedy(
example_prompts, max_tokens)
with VllmRunner(MODEL,
additional_config={
'SLO_limits_for_dynamic_batch': -1,
},
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=dynamic_batch_output,
name_0="vllm_output",
name_1="chunked_prefill_output",
)
def test_async_scheduling_eager() -> None:
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 10
sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
with VllmRunner(
"Qwen/Qwen2.5-0.5B-Instruct",
max_model_len=4096,
max_num_seqs=50,
dtype="bfloat16",
gpu_memory_utilization=0.9,
async_scheduling=True,
) as vllm_model:
vllm_model.generate(prompts, sampling_params=sampling_params)
def test_async_scheduling_with_full_graph() -> None:
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 10
sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
with VllmRunner("Qwen/Qwen3-8B",
max_model_len=4096,
max_num_seqs=50,
dtype="bfloat16",
gpu_memory_utilization=0.9,
async_scheduling=True,
compilation_config={"cudagraph_mode":
"FULL"}) as vllm_model:
vllm_model.generate(prompts, sampling_params=sampling_params)

View File

@@ -0,0 +1,82 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`.
"""
import gc
import pytest
import torch
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
model: str,
max_tokens: int,
) -> None:
prompts = ["The president of the United States is"]
sampling_params = SamplingParams(
max_tokens=max_tokens,
temperature=0.0,
)
with VllmRunner(model,
long_prefill_token_threshold=20,
enforce_eager=False) as vllm_model:
output1 = vllm_model.generate(prompts, sampling_params)
with VllmRunner(model,
enforce_eager=False,
additional_config={
'ascend_scheduler_config': {
'enabled': True
},
}) as vllm_model:
output2 = vllm_model.generate(prompts, sampling_params)
# Extract the generated token IDs for comparison
token_ids1 = output1[0][0][0]
token_ids2 = output2[0][0][0]
print(f"Token IDs 1: {token_ids1}")
print(f"Token IDs 2: {token_ids2}")
# Convert token IDs to tensors and calculate cosine similarity
# Use the length of the shorter sequence to ensure consistent dimensions
min_len = min(len(token_ids1), len(token_ids2))
tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
# Calculate similarity using torch.cosine_similarity
similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
print(f"Token IDs cosine similarity: {similarity.item()}")
assert similarity > 0.95
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -20,6 +20,7 @@
Run `pytest tests/test_offline_inference.py`.
"""
import pytest
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
@@ -54,6 +55,40 @@ def test_multimodal_vl(prompt_template):
assert output_str, "Generated output should not be empty."
@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
"Add this back after fixing the issue.")
def test_multimodal_ascend_scheduler(prompt_template):
image = ImageAsset("cherry_blossom") \
.pil_image.convert("RGB")
img_questions = [
"What is the content of this image?",
"Describe the content of this image in detail.",
"What's in the image?",
"Where is this image taken?",
]
images = [image] * len(img_questions)
prompts = prompt_template(img_questions)
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
max_model_len=4096,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
enforce_eager=True) as vllm_model:
outputs = vllm_model.generate_greedy(prompts=prompts,
images=images,
max_tokens=64)
assert len(outputs) == len(prompts)
for _, output_str in outputs:
assert output_str, "Generated output should not be empty."
def test_multimodal_audio():
audio_prompt = "".join([
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"

View File

@@ -0,0 +1,134 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from vllm.config import SchedulerConfig
from tests.ut.base import TestBase
from vllm_ascend.core.schedule_config import AscendSchedulerConfig
class TestAscendSchedulerConfig(TestBase):
def setUp(self):
self.basic_scheduler_config = SchedulerConfig(
max_num_batched_tokens=8192,
max_model_len=8192,
is_multimodal_model=False,
send_delta_data=False,
)
def test_initialize_from_config_with_default(self):
# No additional config given, check the default value here.
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config, {})
self.assertEqual(ascend_config.enable_chunked_prefill, False)
self.assertEqual(ascend_config.policy, "fcfs")
self.assertEqual(ascend_config.scheduler_cls,
"vllm_ascend.core.scheduler.AscendScheduler")
self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
self.assertEqual(ascend_config.encoder_cache_size, 8192)
def test_initialize_from_config_with_override(self):
# test override
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
enable_chunked_prefill=False,
policy="fcfs",
scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
max_num_batched_tokens=8192,
max_model_len=2048,
max_long_partial_prefills=1,
long_prefill_token_threshold=512,
),
)
self.assertEqual(ascend_config.enable_chunked_prefill, False)
self.assertEqual(ascend_config.policy, "fcfs")
self.assertEqual(ascend_config.scheduler_cls,
"vllm_ascend.core.scheduler.AscendScheduler")
self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
self.assertEqual(ascend_config.encoder_cache_size, 8192)
self.assertEqual(ascend_config.max_long_partial_prefills, 1)
self.assertEqual(ascend_config.long_prefill_token_threshold, 512)
def test_not_implemented_policy(self):
with self.assertRaises(NotImplementedError) as context:
AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
policy="custom_policy",
max_num_batched_tokens=8192,
max_model_len=2048,
),
)
self.assertIn(
"currently AscendScheduler only supports fcfs policy",
str(context.exception),
)
def test_no_override(self):
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config, {})
self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
self.assertEqual(ascend_config.encoder_cache_size, 8192)
def test_valid_config_with_multimodal(self):
config = AscendSchedulerConfig.initialize_from_config(
SchedulerConfig(is_multimodal_model=True,
max_num_batched_tokens=8192), {})
self.assertTrue(config.is_multimodal_model)
def test_valid_config_with_chunked_prefill(self):
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
enable_chunked_prefill=True,
max_num_batched_tokens=8192,
max_model_len=8192,
),
)
self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
self.assertEqual(ascend_config.max_model_len, 8192)
self.assertTrue(ascend_config.enable_chunked_prefill)
def test_invalid_config_without_chunked_prefill(self):
with self.assertRaises(ValueError) as context:
AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
enable_chunked_prefill=False,
max_num_batched_tokens=2048,
max_model_len=8192,
),
)
self.assertIn(
"Ascend scheduler is enabled without chunked prefill feature",
str(context.exception),
)
self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
self.assertIn("max_model_len (8192)", str(context.exception))
def test_initialize_from_config_with_pd_transfer(self):
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
enable_pd_transfer=True,
decode_max_num_seqs=48,
max_num_batched_tokens=8192,
max_model_len=4096,
),
)
self.assertEqual(ascend_config.enable_pd_transfer, True)
self.assertEqual(ascend_config.decode_max_num_seqs, 48)

File diff suppressed because it is too large

View File

@@ -99,6 +99,7 @@ class TestAscendRowParallelLinear(BaseLinearTest):
ascend_config._ASCEND_CONFIG = MagicMock()
ascend_config._ASCEND_CONFIG.oproj_tensor_parallel_size = 2
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
linear = AscendRowParallelLinear(
input_size=16,

View File

@@ -209,7 +209,12 @@ class TestAscendLogitsProcessor(unittest.TestCase):
return_value=torch.randn(1, self.vocab_size)),
patch(
"vllm_ascend.ops.vocab_parallel_embedding.get_lmhead_tp_group.all_gather",
return_value=torch.randn(1, self.vocab_size))
return_value=torch.randn(1, self.vocab_size)),
patch(
"vllm_ascend.core.schedule_config.AscendSchedulerConfig.initialize_from_config",
return_value=MagicMock(max_num_batched_tokens=1000,
max_model_len=512,
enable_chunked_prefill=False))
]
for p in self.patches:

View File

@@ -33,6 +33,13 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
mock_get_ep_group.return_value = mock_ep_group
mock_ascend_config = Mock()
# Create a Mock object with concrete attributes to represent ascend_scheduler_config
mock_ascend_scheduler_config = Mock()
mock_ascend_scheduler_config.enabled = False
mock_ascend_scheduler_config.max_num_batched_tokens = 1024
mock_ascend_scheduler_config.max_model_len = 2048
mock_ascend_config.ascend_scheduler_config = mock_ascend_scheduler_config
mock_ascend_config.torchair_graph_config = Mock(enabled=False)
mock_ascend_config.enable_chunked_prefill = False
mock_get_ascend_config.return_value = mock_ascend_config

View File

@@ -56,6 +56,9 @@ class TestAscendConfig(TestBase):
self.assertTrue(torchair_graph_config.enable_frozen_parameter)
self.assertFalse(torchair_graph_config.enable_kv_nz)
ascend_scheduler_config = ascend_config.ascend_scheduler_config
self.assertFalse(ascend_scheduler_config.enabled)
@_clean_up_ascend_config
def test_init_ascend_config_with_additional_config(self):
test_vllm_config = VllmConfig()
@@ -71,6 +74,9 @@ class TestAscendConfig(TestBase):
"enable_kv_nz": True
},
"multistream_overlap_shared_expert": True,
"ascend_scheduler_config": {
"enabled": True
},
"expert_map_path": "test_expert_map_path",
"refresh": True,
}
@@ -88,6 +94,9 @@ class TestAscendConfig(TestBase):
self.assertTrue(torchair_graph_config.enable_frozen_parameter)
self.assertTrue(torchair_graph_config.enable_kv_nz)
ascend_scheduler_config = ascend_config.ascend_scheduler_config
self.assertTrue(ascend_scheduler_config.enabled)
@_clean_up_ascend_config
def test_init_ascend_config_with_refresh(self):
test_vllm_config = VllmConfig()

View File

@@ -32,6 +32,7 @@ class TestNPUPlatform(TestBase):
def mock_vllm_ascend_config():
mock_ascend_config = MagicMock()
mock_ascend_config.torchair_graph_config.enabled = False
mock_ascend_config.ascend_scheduler_config.enabled = False
mock_ascend_config.enable_shared_expert_dp = False
return mock_ascend_config
@@ -521,6 +522,31 @@ class TestNPUPlatform(TestBase):
self.platform.check_and_update_config(vllm_config)
self.assertEqual(vllm_config.compilation_config.custom_ops, [])
@patch('vllm_ascend.utils.get_ascend_device_type',
return_value=AscendDeviceType._910_93)
@patch("vllm_ascend.ascend_config.check_ascend_config")
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch(
"vllm_ascend.core.recompute_schedule_config.RecomputeSchedulerConfig.initialize_from_config"
)
def test_check_and_update_config_ascend_scheduler_config(
self, mock_init_recompute, mock_init_ascend, mock_check_ascend,
mock_soc_version):
mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config()
mock_ascend_config.ascend_scheduler_config.enabled = True
mock_init_ascend.return_value = mock_ascend_config
vllm_config = TestNPUPlatform.mock_vllm_config()
vllm_config.parallel_config.tensor_parallel_size = 1
mock_init_recompute.return_value = MagicMock()
with patch("vllm_ascend.core.schedule_config.AscendSchedulerConfig"
) as mock_scheduler:
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(vllm_config)
mock_scheduler.initialize_from_config.assert_called_once()
@patch('vllm_ascend.platform.get_ascend_config')
def test_get_attn_backend_cls_use_v1_and_mla(self, mock_get_ascend_config):
mock_config = MagicMock()

View File

@@ -253,10 +253,12 @@ class TestUtils(TestBase):
model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
test_model_config = ModelConfig(model=model_path, enforce_eager=True)
test_parallel_config = ParallelConfig()
ascend_config = {"ascend_scheduler_config": {"enabled": False}}
test_vllm_config = VllmConfig(
model_config=test_model_config,
compilation_config=test_compilation_config,
parallel_config=test_parallel_config)
parallel_config=test_parallel_config,
additional_config=ascend_config)
utils.update_aclgraph_sizes(test_vllm_config)
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
utils.update_aclgraph_sizes(test_vllm_config)

View File

@@ -235,6 +235,8 @@ def test_torchair_deepseek_v2_mlp(mock_distributed, base_config):
hidden_act="silu",
quant_config=None)
assert isinstance(mlp.act_fn, TorchairDeepseekV2SiluAndMul)
ascend_config = MagicMock()
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
with patch(
"vllm_ascend.torchair.models.torchair_deepseek_v2.QuantizationConfig"
) as mock_quant_config: