From 9c7428b3d5b63939c15ae713edc3871e51b98cbc Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Sat, 12 Apr 2025 10:24:53 +0800
Subject: [PATCH] [CI] enable custom ops build (#466)

### What this PR does / why we need it?
This PR enables the custom ops build by default.

### Does this PR introduce _any_ user-facing change?
Yes. Installing vllm-ascend from source now triggers the custom ops build step.

### How was this patch tested?
By the image build and e2e CI.

---------

Signed-off-by: wangxiyuan
---
 .github/actionlint.yaml                       |   5 +
 .github/workflows/actionlint.yml              |   2 +
 .github/workflows/image_openeuler.yml         |   7 +-
 .../workflows/{image.yml => image_ubuntu.yml} |  11 +-
 .github/workflows/vllm_ascend_test.yaml       | 282 ++----------------
 CMakeLists.txt                                |   5 +-
 Dockerfile                                    |  14 +-
 Dockerfile.openEuler                          |  17 +-
 docs/source/quick_start.md                    |   4 +-
 docs/source/tutorials/multi_node.md           |   2 +-
 docs/source/user_guide/release_notes.md       |   2 +-
 format.sh                                     |   2 +-
 pta_install.sh                                |   4 +-
 pyproject.toml                                |   3 +-
 pytest.ini                                    |   3 -
 requirements.txt                              |   2 +-
 setup.py                                      |  34 +--
 .../test_offline_inference_distributed.py     |  55 ++++
 .../test_offline_inference.py                 |  22 --
 tools/actionlint.sh                           |   7 +-
 tools/shellcheck.sh                           |   3 -
 vllm_ascend/envs.py                           |  21 +-
 22 files changed, 165 insertions(+), 342 deletions(-)
 create mode 100644 .github/actionlint.yaml
 rename .github/workflows/{image.yml => image_ubuntu.yml} (90%)
 create mode 100644 tests/multicard/test_offline_inference_distributed.py
 rename tests/{ => singlecard}/test_offline_inference.py (68%)

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
new file mode 100644
index 0000000..972abb3
--- /dev/null
+++ b/.github/actionlint.yaml
@@ -0,0 +1,5 @@
+self-hosted-runner:
+  # Labels of self-hosted runners, as an array of strings.
+  labels:
+    - linux-arm64-npu-1
+    - linux-arm64-npu-4
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index 98b2146..294b814 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -46,6 +46,8 @@ jobs:
         fetch-depth: 0

     - name: "Run actionlint"
+      env:
+        SHELLCHECK_OPTS: --exclude=SC2046,SC2006
       run: |
         echo "::add-matcher::.github/workflows/matchers/actionlint.json"
         tools/actionlint.sh -color
diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml
index ed2baf3..3e074d1 100644
--- a/.github/workflows/image_openeuler.yml
+++ b/.github/workflows/image_openeuler.yml
@@ -72,9 +72,6 @@ jobs:

     - name: Build - Set up QEMU
       uses: docker/setup-qemu-action@v3
-      # TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
-      with:
-        image: tonistiigi/binfmt:qemu-v7.0.0-28

     - name: Build - Set up Docker Buildx
       uses: docker/setup-buildx-action@v3
@@ -98,3 +95,7 @@ jobs:
         labels: ${{ steps.meta.outputs.labels }}
         tags: ${{ steps.meta.outputs.tags }}
         file: Dockerfile.openEuler
+        # TODO: support and enable custom ops build for openEuler
+        build-args: |
+          PIP_INDEX_URL=https://pypi.org/simple
+          COMPILE_CUSTOM_KERNELS=0
diff --git a/.github/workflows/image.yml b/.github/workflows/image_ubuntu.yml
similarity index 90%
rename from .github/workflows/image.yml
rename to .github/workflows/image_ubuntu.yml
index 100471a..03fc316 100644
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image_ubuntu.yml
@@ -16,7 +16,7 @@ on:
       - 'main'
       - '*-dev'
     paths:
-      - '.github/workflows/image.yml'
+      - '.github/workflows/image_ubuntu.yml'
       - 'Dockerfile'
       - 'vllm_ascend/**'
   push:
@@ -27,13 +27,13 @@ on:
     tags:
      - 'v*'
     paths:
-      - '.github/workflows/image.yml'
+      - '.github/workflows/image_ubuntu.yml'
       - 'Dockerfile'
       - 'vllm_ascend/**'

 jobs:
   build:
-    name: vllm-ascend image
+    name: vllm-ascend Ubuntu image
     runs-on: ubuntu-latest

     steps:
@@ -72,9 +72,6 @@ jobs:

     - name: Build - Set up QEMU
       uses: docker/setup-qemu-action@v3
-      # TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
-      with:
-        image: tonistiigi/binfmt:qemu-v7.0.0-28

     - name: Build - Set up Docker Buildx
       uses: docker/setup-buildx-action@v3
@@ -98,4 +95,4 @@ jobs:
         labels: ${{ steps.meta.outputs.labels }}
         tags: ${{ steps.meta.outputs.tags }}
         build-args: |
-          PIP_INDEX_URL=https://pypi.org/simple
+          PIP_INDEX_URL=https://pypi.org/simple
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index e69eb82..3042c47 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -41,9 +41,14 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  test-singlenpu:
-    name: vLLM Ascend test main(single-npu)
-    runs-on: linux-arm64-npu-1  # actionlint-ignore: runner-label
+  test:
+    strategy:
+      max-parallel: 2
+      matrix:
+        os: [linux-arm64-npu-1, linux-arm64-npu-4]
+        vllm_version: [main, v0.8.3]
+    name: vLLM Ascend test
+    runs-on: ${{ matrix.os }}
     container:
       image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
     steps:
@@ -72,6 +77,7 @@
       uses: actions/checkout@v4
       with:
         repository: vllm-project/vllm
+        ref: ${{ matrix.vllm_version }}
         path: ./vllm-empty

     - name: Install vllm-project/vllm from source
@@ -79,11 +85,6 @@
       run: |
         VLLM_TARGET_DEVICE=empty pip install -e .

-    - name: Install vllm-project/vllm-ascend
-      run: |
-        pip install -r requirements-dev.txt
-        pip install -e .
-
     - name: Install pta
       run: |
         if [ ! -d /root/.cache/pta ]; then
           mkdir -p /root/.cache/pta
         fi

         if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
           cd /root/.cache/pta
           rm -rf pytorch_v2.5.1_py310*
           wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
           tar -zxvf pytorch_v2.5.1_py310.tar.gz
         fi

         pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

+    - name: Install vllm-project/vllm-ascend
+      run: |
+        pip install -r requirements-dev.txt
+        pip install -e .
+
     - name: Run vllm-project/vllm-ascend test on V0 engine
       env:
         VLLM_USE_V1: 0
         HF_ENDPOINT: https://hf-mirror.com
       run: |
-        VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
+        if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+          pytest -sv tests/singlecard
+          pytest -sv tests/ops
+        else
+          pytest -sv tests/multicard
+          pytest -sv tests/ops
+        fi

     - name: Run vllm-project/vllm-ascend test for V1 Engine
       env:
         VLLM_USE_V1: 1
         VLLM_WORKER_MULTIPROC_METHOD: spawn
         HF_ENDPOINT: https://hf-mirror.com
       run: |
-        pytest -sv -m 'not multinpu' tests
+        if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+          pytest -sv tests/singlecard
+          pytest -sv tests/ops
+        else
+          pytest -sv tests/multicard
+          pytest -sv tests/ops
+        fi

     - name: Run vllm-project/vllm test for V0 Engine
       env:
         VLLM_USE_V1: 0
         PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
         HF_ENDPOINT: https://hf-mirror.com
       run: |
         pytest -sv
-
-  test-multinpu:
-    name: vLLM Ascend test main(multi-npu)
-    runs-on: linux-arm64-npu-4
-    container:
-      image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
-      env:
-        HF_ENDPOINT: https://hf-mirror.com
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-
-      - name: Install system dependencies
-        run: |
-          apt-get update -y
-          apt-get -y install git wget
-
-      - name: Config git
-        run: |
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -e .
-
-      - name: Install pta
-        run: |
-          if [ ! -d /root/.cache/pta ]; then
-            mkdir -p /root/.cache/pta
-          fi
-
-          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
-            cd /root/.cache/pta
-            rm -rf pytorch_v2.5.1_py310*
-            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-            tar -zxvf pytorch_v2.5.1_py310.tar.gz
-          fi
-
-          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        env:
-          VLLM_USE_V1: 0
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
-
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          pytest -sv -m 'multinpu' tests
-
-  test-singlenpu-v0_8_3:
-    name: vLLM Ascend test v0.8.3(single-npu)
-    runs-on: linux-arm64-npu-1  # actionlint-ignore: runner-label
-    container:
-      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-          apt-get update -y
-          apt install git -y
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install system dependencies
-        run: |
-          apt-get -y install `cat packages.txt`
-          apt-get -y install gcc g++ cmake libnuma-dev
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: v0.8.3
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -e .
-
-      - name: Install pta
-        run: |
-          if [ ! -d /root/.cache/pta ]; then
-            mkdir -p /root/.cache/pta
-          fi
-
-          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
-            cd /root/.cache/pta
-            rm -rf pytorch_v2.5.1_py310*
-            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-            tar -zxvf pytorch_v2.5.1_py310.tar.gz
-          fi
-
-          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        env:
-          VLLM_USE_V1: 0
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
-
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          pytest -sv -m 'not multinpu' tests
-
-      - name: Run vllm-project/vllm test for V0 Engine
-        env:
-          VLLM_USE_V1: 0
-          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          pytest -sv
-
-  test-multinpu-v0_8_3:
-    name: vLLM Ascend test v0.8.3(multi-npu)
-    runs-on: linux-arm64-npu-4
-    needs: test-multinpu
-    container:
-      image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
-      env:
-        HF_ENDPOINT: https://hf-mirror.com
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-
-      - name: Install system dependencies
-        run: |
-          apt-get update -y
-          apt-get -y install git wget
-
-      - name: Config git
-        run: |
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: v0.8.3
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -e .
-
-      - name: Install pta
-        run: |
-          if [ ! -d /root/.cache/pta ]; then
-            mkdir -p /root/.cache/pta
-          fi
-
-          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
-            cd /root/.cache/pta
-            rm -rf pytorch_v2.5.1_py310*
-            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-            tar -zxvf pytorch_v2.5.1_py310.tar.gz
-          fi
-
-          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        env:
-          VLLM_USE_V1: 0
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
-
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          pytest -sv -m 'multinpu' tests
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1814e4c..682b934 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
 find_package(Torch REQUIRED)

 set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+set(SOC_VERSION ${SOC_VERSION})
 message(STATUS "Detected SOC version: ${SOC_VERSION}")

 if (NOT CMAKE_BUILD_TYPE)
@@ -49,10 +50,6 @@ ascendc_library(vllm_ascend_kernels SHARED
   ${KERNEL_FILES}
 )

-execute_process(COMMAND python3 -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))"
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE TORCH_NPU_PATH
-)
 message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")

 file(GLOB VLLM_ASCEND_SRC
diff --git a/Dockerfile b/Dockerfile
index 2a0f93d..ad4d51e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,12 +18,14 @@
 FROM quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+ARG COMPILE_CUSTOM_KERNELS=1

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
+ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

 RUN apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools && \
+    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
     rm -rf /var/cache/apt/* && \
     rm -rf /var/lib/apt/lists/*

@@ -41,12 +43,16 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i

 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN python3 -m pip uninstall -y triton

-# Install vllm-ascend
-RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
 # Install torch-npu
 RUN bash /workspace/vllm-ascend/pta_install.sh

+# Install vllm-ascend
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
+    export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
+    python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope ray
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 1b3bfa3..967b666 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -17,11 +17,18 @@

 FROM quay.io/ascend/cann:8.0.0-910b-openeuler22.03-py3.10

+ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+ARG COMPILE_CUSTOM_KERNELS=1
+
+ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
+
 RUN yum update -y && \
     yum install -y python3-pip git vim wget net-tools && \
     rm -rf /var/cache/yum &&\
     rm -rf /tmp/*

+RUN pip config set global.index-url ${PIP_INDEX_URL}
+
 WORKDIR /workspace

 COPY . /workspace/vllm-ascend/
@@ -35,12 +42,16 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i

 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN python3 -m pip uninstall -y triton

-# Install vllm-ascend
-RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
 # Install torch-npu
 RUN bash /workspace/vllm-ascend/pta_install.sh

+# Install vllm-ascend
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
+    export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
+    python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope ray
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 265d68e..76edd38 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -9,7 +9,7 @@
 ## Setup environment using container

 :::::{tab-set}
-::::{tab-item} Ubuntu OS
+::::{tab-item} Ubuntu

 ```{code-block} bash
    :substitutions:
@@ -35,7 +35,7 @@ docker run --rm \
 ```
 ::::

-::::{tab-item} openEuler OS
+::::{tab-item} openEuler

 ```{code-block} bash
    :substitutions:
diff --git a/docs/source/tutorials/multi_node.md b/docs/source/tutorials/multi_node.md
index fa367d7..35c8b38 100644
--- a/docs/source/tutorials/multi_node.md
+++ b/docs/source/tutorials/multi_node.md
@@ -166,7 +166,7 @@ python -m vllm.entrypoints.openai.api_server \
 ```

 :::{note}
-If you're running DeepSeek V3/R1, please remove `quantization_config` section in `config.json` file since it's not supported by vllm-ascend currentlly.
+If you're running DeepSeek V3/R1, please remove the `quantization_config` section in the `config.json` file since it's not supported by vllm-ascend currently.
 :::

 Once your server is started, you can query the model with input prompts:
diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md
index fa04741..5504820 100644
--- a/docs/source/user_guide/release_notes.md
+++ b/docs/source/user_guide/release_notes.md
@@ -7,7 +7,7 @@ This is 2nd release candidate of v0.7.3 for vllm-ascend. Please follow the [offi
 - Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html

 ### Highlights
-- Add Ascend Custom Ops framewrok. Developers now can write customs ops using AscendC. An example ops `rotary_embedding` is added. More tutorials will come soon. The Custome Ops complation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
+- Add Ascend Custom Ops framework. Developers can now write custom ops using AscendC. An example op, `rotary_embedding`, is added. More tutorials will come soon. The Custom Ops compilation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
 - V1 engine is basic supported in this release. The full support will be done in 0.8.X release. If you hit any issue or have any requirement of V1 engine. Please tell us [here](https://github.com/vllm-project/vllm-ascend/issues/414). [#376](https://github.com/vllm-project/vllm-ascend/pull/376)
 - Prefix cache feature works now. You can set `enable_prefix_caching=True` to enable it. [#282](https://github.com/vllm-project/vllm-ascend/pull/282)
diff --git a/format.sh b/format.sh
index 1d0b940..32202aa 100755
--- a/format.sh
+++ b/format.sh
@@ -144,7 +144,7 @@ CODESPELL_EXCLUDES=(
 )

 CODESPELL_IGNORE_WORDS=(
-    '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend'
+    '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue'
 )

 # check spelling of specified files
diff --git a/pta_install.sh b/pta_install.sh
index 64c1b01..d72512c 100755
--- a/pta_install.sh
+++ b/pta_install.sh
@@ -7,9 +7,9 @@ tar -zxvf pytorch_v2.5.1_py310.tar.gz

 if [ "$(uname -i)" == "aarch64" ]
 then
-    pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+    python3 -m pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
 else
-    pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --extra-index https://download.pytorch.org/whl/cpu/
+    python3 -m pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --extra-index https://download.pytorch.org/whl/cpu/
 fi

 cd ..
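The pta_install.sh change above keeps the script's behavior, dispatching on `uname -i` and installing the matching torch_npu wheel, while routing the install through `python3 -m pip`. For illustration, a minimal Python sketch of the same dispatch, assuming only the two wheel filenames hard-coded in the script; `install_torch_npu` is a hypothetical helper, not part of the repo, and `--extra-index-url` is the long spelling of the `--extra-index` flag the script passes:

    import platform
    import subprocess
    import sys

    # Wheel filenames as hard-coded in pta_install.sh.
    WHEELS = {
        "aarch64": "torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
        "x86_64": "torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
    }

    def install_torch_npu() -> None:
        # Mirror the script's `uname -i` architecture dispatch.
        arch = platform.machine()
        if arch not in WHEELS:
            sys.exit(f"unsupported architecture: {arch}")
        cmd = [sys.executable, "-m", "pip", "install", f"./{WHEELS[arch]}"]
        if arch == "x86_64":
            # The x86_64 branch also pulls CPU torch from the PyTorch index.
            cmd += ["--extra-index-url", "https://download.pytorch.org/whl/cpu/"]
        subprocess.check_call(cmd)

    if __name__ == "__main__":
        install_torch_npu()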
diff --git a/pyproject.toml b/pyproject.toml
index c73b9b3..f8855c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,12 +4,13 @@ requires = [
     "cmake>=3.26",
     "decorator",
     "numpy<2.0.0",
+    "pip",
    "pybind11",
     "pyyaml",
     "scipy",
     "setuptools>=64",
     "setuptools-scm>=8",
-    "torch_npu >= 2.5.1rc1",
+    "torch_npu",
     "torch >= 2.5.1",
     "torchvision<0.21.0",
 ]
diff --git a/pytest.ini b/pytest.ini
index e2c9818..8889df7 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,8 +1,5 @@
 [pytest]
 minversion = 6.0
-markers =
-    singlenpu: tests that run on single npu
-    multinpu: tests that run on multi npu
 norecursedirs =
     vllm-empty/tests/prefix_caching
     vllm-empty/tests/weight_loading
diff --git a/requirements.txt b/requirements.txt
index 3f3c0b0..e20b03e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,6 @@ pyyaml
 scipy
 setuptools>=64
 setuptools-scm>=8
-torch_npu >= 2.5.1rc1
+torch_npu
 torch >= 2.5.1
 torchvision<0.21.0
diff --git a/setup.py b/setup.py
index 0bcfd88..912de3c 100644
--- a/setup.py
+++ b/setup.py
@@ -153,23 +153,6 @@ class cmake_build_ext(build_ext):
             # else specify pybind11 path installed from source code on CI container
             raise RuntimeError(f"CMake configuration failed: {e}")

-        # try retrive soc version from npu-smi
-        soc_command = [
-            "bash",
-            "-c",
-            "npu-smi info | grep OK | awk '{print $3}' | head -n 1",
-        ]
-        try:
-            soc_version = subprocess.check_output(soc_command,
-                                                  text=True).strip()
-            soc_version = soc_version.split("-")[0]
-            soc_version = "Ascend" + soc_version
-        except subprocess.CalledProcessError as e:
-            raise RuntimeError(f"Retrive Soc version failed: {e}")
-
-        # add SOC_VERSION
-        cmake_args += [f"-DSOC_VERSION={soc_version}"]
-
         install_path = os.path.join(ROOT_DIR, self.build_lib)
         if isinstance(self.distribution.get_command_obj("develop"), develop):
             install_path = os.path.join(ROOT_DIR, "vllm_ascend")
@@ -178,6 +161,8 @@ class cmake_build_ext(build_ext):

         cmake_args += [f"-DCMAKE_PREFIX_PATH={pybind11_cmake_path}"]

+        cmake_args += [f"-DSOC_VERSION={envs.SOC_VERSION}"]
+
         # Override the base directory for FetchContent downloads to $ROOT/.deps
         # This allows sharing dependencies between profiles,
         # and plays more nicely with sccache.
@@ -186,6 +171,17 @@ class cmake_build_ext(build_ext):
             fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
             cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]

+        torch_npu_command = "python3 -m pip show torch-npu | grep '^Location:' | awk '{print $2}'"
+        try:
+            torch_npu_path = subprocess.check_output(
+                torch_npu_command, shell=True).decode().strip()
+            torch_npu_path += "/torch_npu"
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"Retrieve torch_npu path failed: {e}")
+
+        # add TORCH_NPU_PATH
+        cmake_args += [f"-DTORCH_NPU_PATH={torch_npu_path}"]
+
         build_tool = []
         # TODO(ganyi): ninja and ccache support for ascend c auto codegen. now we can only use make build
         # if which('ninja') is not None:
@@ -205,7 +201,7 @@ class cmake_build_ext(build_ext):
         )

     def build_extensions(self) -> None:
-        if envs.COMPILE_CUSTOM_KERNELS is None:
+        if not envs.COMPILE_CUSTOM_KERNELS:
             return
         # Ensure that CMake is present and working
         try:
@@ -285,7 +281,7 @@ except LookupError:
     VERSION = "0.0.0"

 ext_modules = []
-if envs.COMPILE_CUSTOM_KERNELS is not None:
+if envs.COMPILE_CUSTOM_KERNELS:
     ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]
diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py
new file mode 100644
index 0000000..1304001
--- /dev/null
+++ b/tests/multicard/test_offline_inference_distributed.py
@@ -0,0 +1,55 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Compare the short outputs of HF and vLLM when using greedy sampling.
+
+Run `pytest tests/multicard/test_offline_inference_distributed.py`.
+"""
+import os
+
+import pytest
+import vllm  # noqa: F401
+from conftest import VllmRunner
+
+os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
+
+
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("Qwen/QwQ-32B", "mp"),
+])
+def test_models_distributed(model: str,
+                            distributed_executor_backend: str) -> None:
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    dtype = "half"
+    max_tokens = 5
+    with VllmRunner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=4,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+if __name__ == "__main__":
+    import pytest
+    pytest.main([__file__])
diff --git a/tests/test_offline_inference.py b/tests/singlecard/test_offline_inference.py
similarity index 68%
rename from tests/test_offline_inference.py
rename to tests/singlecard/test_offline_inference.py
index ecff067..3c17605 100644
--- a/tests/test_offline_inference.py
+++ b/tests/singlecard/test_offline_inference.py
@@ -53,28 +53,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.multinpu
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("Qwen/QwQ-32B", "mp"),
-])
-def test_models_distributed(vllm_runner, model: str,
-                            distributed_executor_backend: str) -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 if __name__ == "__main__":
     import pytest
     pytest.main([__file__])
diff --git a/tools/actionlint.sh b/tools/actionlint.sh
index d97b4bb..482d866 100755
--- a/tools/actionlint.sh
+++ b/tools/actionlint.sh
@@ -20,14 +20,13 @@
 #

 if command -v actionlint &> /dev/null; then
-  # NOTE: avoid check .github/workflows/vllm_ascend_test.yaml because sel-hosted runner `npu-arm64` is unknown
-  actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+  actionlint .github/workflows/*.yml .github/workflows/*.yaml
   exit 0
 elif [ -x ./actionlint ]; then
-  ./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+  ./actionlint .github/workflows/*.yml .github/workflows/*.yaml
   exit 0
 fi

 # download a binary to the current directory - v1.7.3
 bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
-./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+./actionlint .github/workflows/*.yml .github/workflows/*.yaml
diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh
index d782af7..1194e65 100755
--- a/tools/shellcheck.sh
+++ b/tools/shellcheck.sh
@@ -39,6 +39,3 @@ if ! [ -x "$(command -v shellcheck)" ]; then
     PATH="$PATH:$(pwd)/shellcheck-${scversion}"
     export PATH
 fi
-
-# TODO - fix warnings in .buildkite/run-amd-test.sh
-find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"'
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 014bfd7..fb88c3b 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -3,14 +3,21 @@ from typing import Any, Callable, Dict

 env_variables: Dict[str, Callable[[], Any]] = {
     # max compile thread num
-    "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
-    "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"),
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
     "COMPILE_CUSTOM_KERNELS":
-    lambda: os.getenv("COMPILE_CUSTOM_KERNELS", None),
-    # If set, vllm-ascend will print verbose logs during compliation
-    "VERBOSE": lambda: bool(int(os.getenv('VERBOSE', '0'))),
-    "ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
-    "LD_LIBRARY_PATH": lambda: os.getenv("LD_LIBRARY_PATH", None),
+    lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
+    "SOC_VERSION":
+    lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
+    # If set, vllm-ascend will print verbose logs during compilation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+    "ASCEND_HOME_PATH":
+    lambda: os.getenv("ASCEND_HOME_PATH", None),
+    "LD_LIBRARY_PATH":
+    lambda: os.getenv("LD_LIBRARY_PATH", None),
 }
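Taken together, the setup.py and envs.py changes make the custom ops build opt-out rather than opt-in: COMPILE_CUSTOM_KERNELS now defaults to 1, and the SOC version comes from the SOC_VERSION environment variable instead of being scraped from `npu-smi`. A minimal sketch of how the new defaults resolve; the names and fallbacks mirror the lambdas above:

    import os

    # Custom kernels are compiled unless COMPILE_CUSTOM_KERNELS is set to "0",
    # and the target SOC falls back to ASCEND910B1 when SOC_VERSION is unset.
    compile_custom_kernels = bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1")))
    soc_version = os.getenv("SOC_VERSION", "ASCEND910B1")

    print(compile_custom_kernels, soc_version)
    # With no overrides this prints: True ASCEND910B1
    # The openEuler image build passes COMPILE_CUSTOM_KERNELS=0, which makes the
    # first value False, so setup.py skips the vllm_ascend_C extension entirely.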