Revert "drop ascend scheduler" (#4580)

Reverts vllm-project/vllm-ascend#4498

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2
2025-11-29 22:20:48 +08:00
parent 4dbe4fd123
commit 517fd9272d
52 changed files with 2948 additions and 85 deletions
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -48,26 +48,27 @@ def mtp_correctness(sampling_config: SamplingParams,
    if graph_mode == CUDAGraphMode.FULL:
        graph_mode_str = "FULL_DECODE_ONLY"

-    with VllmRunner(model_name,
-                    tensor_parallel_size=1,
-                    max_num_seqs=256,
-                    gpu_memory_utilization=0.7,
-                    distributed_executor_backend="mp",
-                    enable_expert_parallel=True,
-                    speculative_config={
-                        "method":
-                        "deepseek_mtp",
-                        "num_speculative_tokens":
-                        num_speculative_tokens,
-                        "disable_padded_drafter_batch":
-                        disable_padded_drafter_batch,
-                    },
-                    enforce_eager=enforce_eager,
-                    max_model_len=2000,
-                    compilation_config=CompilationConfig(
-                        cudagraph_mode=graph_mode_str,
-                        cudagraph_capture_sizes=[12],
-                    )) as spec_llm:
+    with VllmRunner(
+            model_name,
+            tensor_parallel_size=1,
+            max_num_seqs=256,
+            gpu_memory_utilization=0.7,
+            distributed_executor_backend="mp",
+            enable_expert_parallel=True,
+            speculative_config={
+                "method": "deepseek_mtp",
+                "num_speculative_tokens": num_speculative_tokens,
+                "disable_padded_drafter_batch": disable_padded_drafter_batch,
+            },
+            enforce_eager=enforce_eager,
+            max_model_len=2000,
+            compilation_config=CompilationConfig(
+                cudagraph_mode=graph_mode_str,
+                cudagraph_capture_sizes=[12],
+            ),
+            additional_config={"ascend_scheduler_config": {
+                "enabled": False
+            }}) as spec_llm:
        spec_outputs = spec_llm.generate(example_prompts, sampling_config)

    matches = 0