[Scheduler][MTP] Add support for speculative decoding in AscendScheduler. (#943)
This PR adds support for speculative decoding in AscendScheduler. It also includes part of the support for disaggregated prefill; full support will be merged in a follow-up PR. --------- Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
15
.github/workflows/vllm_ascend_test.yaml
vendored
15
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -180,18 +180,20 @@ jobs:
|
||||
run: |
|
||||
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
|
||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
|
||||
pytest -sv tests/singlecard/test_scheduler.py
|
||||
# guided decoding doesn't work, fix it later
|
||||
# pytest -sv tests/singlecard/test_guided_decoding.py.py
|
||||
# test_ascend_config.py should be ran separately because it will regenerate the global config many times.
|
||||
pytest -sv tests/singlecard/test_ascend_config.py
|
||||
pytest -sv tests/singlecard/test_camem.py
|
||||
# pytest -sv tests/singlecard/core/test_ascend_scheduler.py
|
||||
# pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py
|
||||
pytest -sv tests/singlecard/ \
|
||||
--ignore=tests/singlecard/test_offline_inference.py \
|
||||
--ignore=tests/singlecard/test_scheduler.py \
|
||||
--ignore=tests/singlecard/test_guided_decoding.py \
|
||||
--ignore=tests/singlecard/test_ascend_config.py \
|
||||
--ignore=tests/singlecard/test_camem.py
|
||||
--ignore=tests/singlecard/test_camem.py \
|
||||
--ignore=tests/singlecard/core/test_ascend_scheduler.py \
|
||||
--ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
|
||||
else
|
||||
pytest -sv tests/multicard/test_ilama_lora_tp2.py
|
||||
# To avoid oom, we need to run the test in a single process.
|
||||
@@ -209,20 +211,21 @@ jobs:
|
||||
run: |
|
||||
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
|
||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
|
||||
pytest -sv tests/singlecard/test_scheduler.py
|
||||
# guided decoding doesn't work, fix it later
|
||||
# pytest -sv tests/singlecard/test_guided_decoding.py.py
|
||||
pytest -sv tests/singlecard/test_camem.py
|
||||
# test_ascend_config.py should be ran separately because it will regenerate the global config many times.
|
||||
pytest -sv tests/singlecard/test_ascend_config.py
|
||||
pytest -sv tests/singlecard/test_prompt_embedding.py
|
||||
pytest -sv tests/singlecard/core/test_ascend_scheduler.py
|
||||
pytest -sv tests/singlecard/ \
|
||||
--ignore=tests/singlecard/test_offline_inference.py \
|
||||
--ignore=tests/singlecard/test_scheduler.py \
|
||||
--ignore=tests/singlecard/test_guided_decoding.py \
|
||||
--ignore=tests/singlecard/test_camem.py \
|
||||
--ignore=tests/singlecard/test_ascend_config.py \
|
||||
--ignore=tests/singlecard/test_prompt_embedding.py
|
||||
--ignore=tests/singlecard/test_prompt_embedding.py \
|
||||
--ignore=tests/singlecard/core/test_ascend_scheduler.py \
|
||||
--ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
|
||||
else
|
||||
pytest -sv tests/multicard/test_ilama_lora_tp2.py
|
||||
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
|
||||
|
||||
Reference in New Issue
Block a user