[SpecDecode][CI] Set default values to fix spec decode and fix multicard CI (#1109)
### What this PR does / why we need it?
- Set default values to fix spec decode.
- To avoid OOM, run each multicard test in its own process.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- CI passed, especially the multicard CI.
- For the spec decode test, long-term CI passed.

Closes: https://github.com/vllm-project/vllm-ascend/pull/1105

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: mengwei805 <mengwei25@huawei.com>
.github/workflows/vllm_ascend_test.yaml (8 changes)

```diff
@@ -123,7 +123,11 @@ jobs:
             --ignore=tests/singlecard/test_camem.py
           else
             pytest -sv tests/multicard/test_ilama_lora_tp2.py
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py
+            # To avoid oom, we need to run the test in a single process.
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
           fi
 
       - name: Run vllm-project/vllm-ascend test on V0 engine
@@ -149,7 +153,9 @@ jobs:
           else
             pytest -sv tests/multicard/test_ilama_lora_tp2.py
             # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
+            # To avoid oom, we need to run the test in a single process.
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
           fi
```
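For clarity: the workflow change swaps one big `pytest tests/multicard/` run for several smaller runs because each `pytest` invocation is its own OS process, and device memory is only reliably reclaimed when that process exits. Below is a rough, illustrative Python equivalent of that shell sequence; the test IDs are taken from the diff, but the script itself is not part of the PR.

```python
# Illustrative sketch only: run each multicard test in a fresh pytest process
# so memory held by one model's test is fully released before the next starts.
import os
import subprocess
import sys

# Test IDs mirroring the workflow change above.
TESTS = [
    "tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ",
    "tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek",
    "tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk",
]

env = {**os.environ, "VLLM_USE_MODELSCOPE": "True"}
for test in TESTS:
    # One interpreter per test: OOM cannot accumulate across tests.
    result = subprocess.run([sys.executable, "-m", "pytest", "-sv", test],
                            env=env)
    if result.returncode != 0:
        sys.exit(result.returncode)
```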
The second hunk patches `create_worker` (the source file path is not preserved in this view):

```diff
@@ -56,6 +56,12 @@ def create_worker(
         draft_worker_kwargs.pop("ngram_prompt_lookup_max"))
     ngram_prompt_lookup_min = (
         draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
+
+    # TODO(Yizhou): A quick fix, must be refactored ASAP
+    draft_worker_kwargs["vllm_config"].parallel_config.expert_parallel_size = 1
+    draft_worker_kwargs[
+        "vllm_config"].parallel_config.expert_tensor_parallel_size = 1
+
     draft_model_config = draft_worker_kwargs["vllm_config"].model_config
     draft_parallel_config: ParallelConfig = draft_worker_kwargs[
         'vllm_config'].parallel_config
```
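The hunk above pins the draft (speculative) worker's expert-parallel settings to 1 before the worker is constructed, presumably because the draft model runs without expert parallelism and unset values broke spec decode. Below is a minimal, self-contained sketch of that default-setting pattern; `ParallelConfig`, `VllmConfig`, and `apply_spec_decode_defaults` here are simplified stand-ins, not vLLM's actual classes.

```python
# Minimal runnable sketch of the quick fix, with stand-in dataclasses
# (names and default values here are illustrative assumptions).
from dataclasses import dataclass, field


@dataclass
class ParallelConfig:
    tensor_parallel_size: int = 2
    # Left unset for the draft worker in the real config, which is what
    # the patch works around by pinning both values to 1.
    expert_parallel_size: int = 0
    expert_tensor_parallel_size: int = 0


@dataclass
class VllmConfig:
    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)


def apply_spec_decode_defaults(draft_worker_kwargs: dict) -> None:
    """Mirror the TODO(Yizhou) quick fix: the draft model is built
    without expert parallelism, so both expert sizes default to 1."""
    parallel_config = draft_worker_kwargs["vllm_config"].parallel_config
    parallel_config.expert_parallel_size = 1
    parallel_config.expert_tensor_parallel_size = 1


kwargs = {"vllm_config": VllmConfig()}
apply_spec_decode_defaults(kwargs)
assert kwargs["vllm_config"].parallel_config.expert_parallel_size == 1
```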