[Scheduler][MTP] Add support for speculative decoding in AsecendScheduler. (#943)

This PR adds support for speculative decoding in AsecendScheduler.
Also inculde part of support for disaggregated prefill, full support
will be merged in follow-up PR.

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
whx
2025-06-11 20:55:44 +08:00
committed by GitHub
parent 4f5964420e
commit 3393d53b36
5 changed files with 1001 additions and 49 deletions

View File

@@ -180,18 +180,20 @@ jobs:
run: |
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
pytest -sv tests/singlecard/test_scheduler.py
# guided decoding doesn't work, fix it later
# pytest -sv tests/singlecard/test_guided_decoding.py.py
# test_ascend_config.py should be ran separately because it will regenerate the global config many times.
pytest -sv tests/singlecard/test_ascend_config.py
pytest -sv tests/singlecard/test_camem.py
# pytest -sv tests/singlecard/core/test_ascend_scheduler.py
# pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py
pytest -sv tests/singlecard/ \
--ignore=tests/singlecard/test_offline_inference.py \
--ignore=tests/singlecard/test_scheduler.py \
--ignore=tests/singlecard/test_guided_decoding.py \
--ignore=tests/singlecard/test_ascend_config.py \
--ignore=tests/singlecard/test_camem.py
--ignore=tests/singlecard/test_camem.py \
--ignore=tests/singlecard/core/test_ascend_scheduler.py \
--ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
else
pytest -sv tests/multicard/test_ilama_lora_tp2.py
# To avoid oom, we need to run the test in a single process.
@@ -209,20 +211,21 @@ jobs:
run: |
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
pytest -sv tests/singlecard/test_scheduler.py
# guided decoding doesn't work, fix it later
# pytest -sv tests/singlecard/test_guided_decoding.py.py
pytest -sv tests/singlecard/test_camem.py
# test_ascend_config.py should be ran separately because it will regenerate the global config many times.
pytest -sv tests/singlecard/test_ascend_config.py
pytest -sv tests/singlecard/test_prompt_embedding.py
pytest -sv tests/singlecard/core/test_ascend_scheduler.py
pytest -sv tests/singlecard/ \
--ignore=tests/singlecard/test_offline_inference.py \
--ignore=tests/singlecard/test_scheduler.py \
--ignore=tests/singlecard/test_guided_decoding.py \
--ignore=tests/singlecard/test_camem.py \
--ignore=tests/singlecard/test_ascend_config.py \
--ignore=tests/singlecard/test_prompt_embedding.py
--ignore=tests/singlecard/test_prompt_embedding.py \
--ignore=tests/singlecard/core/test_ascend_scheduler.py \
--ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
else
pytest -sv tests/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.