diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index 832ec65..ea3d673 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -47,9 +47,30 @@ defaults:
   shell: bash -el {0}
 
 jobs:
+  dispatch:
+    name: vLLM Ascend test (dispatch)
+    runs-on: ascend-ci-arm64
+    outputs:
+      number: ${{ steps.dispatch-device.outputs.number }}
+    steps:
+      - name: vLLM Ascend test (dispatch)
+        id: dispatch-device
+        run: |
+          # Try to acquire lock to dispatch devices
+          lockfile /tmp/dispatch.lock
+
+          # Print npu info
+          npu-list /dev/null 2>&1
+
+          # Select first available device (Skip reserved davinci1 and davinci0)
+          NUMBER=$(npu-list /dev/null 2>&1 | grep None | grep -v davinci1 | grep -v davinci0 | head -1 | cut -b 15)
+          echo "Dispatch to /dev/davinci$NUMBER"
+          echo "number=$NUMBER" >> $GITHUB_OUTPUT
+
   test:
+    needs: [dispatch]
     name: vLLM Ascend test (self-host)
-    runs-on: ascend-arm64 # actionlint-ignore: runner-label
+    runs-on: ascend-ci-arm64 # actionlint-ignore: runner-label
 
     container:
       image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
@@ -58,9 +79,11 @@ jobs:
         - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
         - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
         # Use self-host cache speed up pip and model download
-        - /home/action/actions-runner/_work/cache:/github/home/.cache/
+        - /home/action/cache:/github/home/.cache/
+        # for dispatch lock
+        - /tmp/:/tmp/
       options: >-
-        --device /dev/davinci6
+        --device /dev/davinci${{ needs.dispatch.outputs.number }}
         --device /dev/davinci_manager
         --device /dev/devmm_svm
         --device /dev/hisi_hdc
@@ -71,6 +94,8 @@ jobs:
         run: |
           npu-smi info
           cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+          # unlock
+          rm -rf /tmp/dispatch.lock
 
       - name: Config mirrors
         run: |
diff --git a/tests/test_offline_inference.py b/tests/test_offline_inference.py
index 484bce6..6ad5c96 100644
--- a/tests/test_offline_inference.py
+++ b/tests/test_offline_inference.py
@@ -32,6 +32,7 @@ MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
 ]
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")