[Scheduler][MTP] Add support for speculative decoding in AscendScheduler. (#943)

This PR adds support for speculative decoding in AscendScheduler. It also includes partial support for disaggregated prefill; full support will be merged in a follow-up PR. --------- Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-06-11 20:55:44 +08:00
parent 4f5964420e
commit 3393d53b36
5 changed files with 1001 additions and 49 deletions
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -180,18 +180,20 @@ jobs:
        run: |
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
            VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
-            pytest -sv tests/singlecard/test_scheduler.py
            # guided decoding doesn't work, fix it later
            # pytest -sv tests/singlecard/test_guided_decoding.py.py
            # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
            pytest -sv tests/singlecard/test_ascend_config.py
            pytest -sv tests/singlecard/test_camem.py
+            # pytest -sv tests/singlecard/core/test_ascend_scheduler.py
+            # pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py
            pytest -sv tests/singlecard/ \
            --ignore=tests/singlecard/test_offline_inference.py \
-            --ignore=tests/singlecard/test_scheduler.py \
            --ignore=tests/singlecard/test_guided_decoding.py \
            --ignore=tests/singlecard/test_ascend_config.py \
-            --ignore=tests/singlecard/test_camem.py
+            --ignore=tests/singlecard/test_camem.py \
+            --ignore=tests/singlecard/core/test_ascend_scheduler.py \
+            --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
          else
            pytest -sv tests/multicard/test_ilama_lora_tp2.py
            # To avoid oom, we need to run the test in a single process.
@@ -209,20 +211,21 @@ jobs:
        run: |
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
            VLLM_USE_MODELSCOPE=True  pytest -sv tests/singlecard/test_offline_inference.py
-            pytest -sv tests/singlecard/test_scheduler.py
            # guided decoding doesn't work, fix it later
            # pytest -sv tests/singlecard/test_guided_decoding.py.py
            pytest -sv tests/singlecard/test_camem.py
            # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
            pytest -sv tests/singlecard/test_ascend_config.py
            pytest -sv tests/singlecard/test_prompt_embedding.py
+            pytest -sv tests/singlecard/core/test_ascend_scheduler.py
            pytest -sv tests/singlecard/ \
              --ignore=tests/singlecard/test_offline_inference.py \
-              --ignore=tests/singlecard/test_scheduler.py \
              --ignore=tests/singlecard/test_guided_decoding.py \
              --ignore=tests/singlecard/test_camem.py \
              --ignore=tests/singlecard/test_ascend_config.py \
-              --ignore=tests/singlecard/test_prompt_embedding.py
+              --ignore=tests/singlecard/test_prompt_embedding.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
          else
            pytest -sv tests/multicard/test_ilama_lora_tp2.py
            # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.