[CI] enable custom ops build (#466)

### What this PR does / why we need it?
This PR enables the custom ops build by default.

### Does this PR introduce _any_ user-facing change?

Yes, installing vllm-ascend from source will now trigger the custom ops
build step.

### How was this patch tested?
By image build and e2e CI

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-04-12 10:24:53 +08:00
committed by GitHub
parent d05ea17427
commit 9c7428b3d5
22 changed files with 165 additions and 342 deletions

5
.github/actionlint.yaml vendored Normal file
View File

@@ -0,0 +1,5 @@
self-hosted-runner:
# Labels of self-hosted runner in array of strings.
labels:
- linux-arm64-npu-1
- linux-arm64-npu-4

View File

@@ -46,6 +46,8 @@ jobs:
fetch-depth: 0
- name: "Run actionlint"
env:
SHELLCHECK_OPTS: --exclude=SC2046,SC2006
run: |
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color

View File

@@ -72,9 +72,6 @@ jobs:
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
# TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -98,3 +95,7 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
file: Dockerfile.openEuler
# TODO: support and enable custom ops build for openEuler
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
COMPILE_CUSTOM_KERNELS=0

View File

@@ -16,7 +16,7 @@ on:
- 'main'
- '*-dev'
paths:
- '.github/workflows/image.yml'
- '.github/workflows/image_ubuntu.yml'
- 'Dockerfile'
- 'vllm_ascend/**'
push:
@@ -27,13 +27,13 @@ on:
tags:
- 'v*'
paths:
- '.github/workflows/image.yml'
- '.github/workflows/image_ubuntu.yml'
- 'Dockerfile'
- 'vllm_ascend/**'
jobs:
build:
name: vllm-ascend image
name: vllm-ascend Ubuntu image
runs-on: ubuntu-latest
steps:
@@ -72,9 +72,6 @@ jobs:
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
# TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -98,4 +95,4 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
PIP_INDEX_URL=https://pypi.org/simple

View File

@@ -41,9 +41,14 @@ concurrency:
cancel-in-progress: true
jobs:
test-singlenpu:
name: vLLM Ascend test main(single-npu)
runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label
test:
strategy:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
vllm_verison: [main, v0.8.3]
name: vLLM Ascend test
runs-on: ${{ matrix.os }}
container:
image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
steps:
@@ -72,6 +77,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_verison }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
@@ -79,11 +85,6 @@ jobs:
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
@@ -99,12 +100,23 @@ jobs:
pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Run vllm-project/vllm-ascend test on V0 engine
env:
VLLM_USE_V1: 0
HF_ENDPOINT: https://hf-mirror.com
run: |
VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
pytest -sv tests/singlecard
pytest -sv tests/ops
else
pytest -sv tests/multicard
pytest -sv tests/ops
fi
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
@@ -112,7 +124,13 @@ jobs:
VLLM_WORKER_MULTIPROC_METHOD: spawn
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv -m 'not multinpu' tests
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
pytest -sv tests/singlecard
pytest -sv tests/ops
else
pytest -sv tests/multicard
pytest -sv tests/ops
fi
- name: Run vllm-project/vllm test for V0 Engine
env:
@@ -121,247 +139,3 @@ jobs:
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv
test-multinpu:
name: vLLM Ascend test main(multi-npu)
runs-on: linux-arm64-npu-4
container:
image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
env:
HF_ENDPOINT: https://hf-mirror.com
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
# sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
- name: Install system dependencies
run: |
apt-get update -y
apt-get -y install git wget
- name: Config git
run: |
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
mkdir -p /root/.cache/pta
fi
if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
cd /root/.cache/pta
rm -rf pytorch_v2.5.1_py310*
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
tar -zxvf pytorch_v2.5.1_py310.tar.gz
fi
pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Run vllm-project/vllm-ascend test on V0 engine
env:
VLLM_USE_V1: 0
HF_ENDPOINT: https://hf-mirror.com
run: |
VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv -m 'multinpu' tests
test-singlenpu-v0_8_3:
name: vLLM Ascend test v0.8.3(single-npu)
runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label
container:
image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: v0.8.3
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
mkdir -p /root/.cache/pta
fi
if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
cd /root/.cache/pta
rm -rf pytorch_v2.5.1_py310*
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
tar -zxvf pytorch_v2.5.1_py310.tar.gz
fi
pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Run vllm-project/vllm-ascend test on V0 engine
env:
VLLM_USE_V1: 0
HF_ENDPOINT: https://hf-mirror.com
run: |
VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv -m 'not multinpu' tests
- name: Run vllm-project/vllm test for V0 Engine
env:
VLLM_USE_V1: 0
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv
test-multinpu-v0_8_3:
name: vLLM Ascend test v0.8.3(multi-npu)
runs-on: linux-arm64-npu-4
needs: test-multinpu
container:
image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
env:
HF_ENDPOINT: https://hf-mirror.com
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
# sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
- name: Install system dependencies
run: |
apt-get update -y
apt-get -y install git wget
- name: Config git
run: |
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: v0.8.3
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
mkdir -p /root/.cache/pta
fi
if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
cd /root/.cache/pta
rm -rf pytorch_v2.5.1_py310*
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
tar -zxvf pytorch_v2.5.1_py310.tar.gz
fi
pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Run vllm-project/vllm-ascend test on V0 engine
env:
VLLM_USE_V1: 0
HF_ENDPOINT: https://hf-mirror.com
run: |
VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv -m 'multinpu' tests

View File

@@ -20,6 +20,7 @@ set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
find_package(Torch REQUIRED)
set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
set(SOC_VERSION ${SOC_VERSION})
message(STATUS "Detected SOC version: ${SOC_VERSION}")
if (NOT CMAKE_BUILD_TYPE)
@@ -49,10 +50,6 @@ ascendc_library(vllm_ascend_kernels SHARED
${KERNEL_FILES}
)
execute_process(COMMAND python3 -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))"
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE TORCH_NPU_PATH
)
message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
file(GLOB VLLM_ASCEND_SRC

View File

@@ -18,12 +18,14 @@
FROM quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
@@ -41,12 +43,16 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN python3 -m pip uninstall -y triton
# Install vllm-ascend
RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install torch-npu
RUN bash /workspace/vllm-ascend/pta_install.sh
# Install vllm-ascend
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope ray

View File

@@ -17,11 +17,18 @@
FROM quay.io/ascend/cann:8.0.0-910b-openeuler22.03-py3.10
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools && \
rm -rf /var/cache/yum &&\
rm -rf /tmp/*
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /workspace/vllm-ascend/
@@ -35,12 +42,16 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN python3 -m pip uninstall -y triton
# Install vllm-ascend
RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install torch-npu
RUN bash /workspace/vllm-ascend/pta_install.sh
# Install vllm-ascend
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope ray

View File

@@ -9,7 +9,7 @@
## Setup environment using container
:::::{tab-set}
::::{tab-item} Ubuntu OS
::::{tab-item} Ubuntu
```{code-block} bash
:substitutions:
@@ -35,7 +35,7 @@ docker run --rm \
```
::::
::::{tab-item} openEuler OS
::::{tab-item} openEuler
```{code-block} bash
:substitutions:

View File

@@ -166,7 +166,7 @@ python -m vllm.entrypoints.openai.api_server \
```
:::{note}
If you're running DeepSeek V3/R1, please remove `quantization_config` section in `config.json` file since it's not supported by vllm-ascend currentlly.
If you're running DeepSeek V3/R1, please remove `quantization_config` section in `config.json` file since it's not supported by vllm-ascend currently.
:::
Once your server is started, you can query the model with input prompts:

View File

@@ -7,7 +7,7 @@ This is 2nd release candidate of v0.7.3 for vllm-ascend. Please follow the [offi
- Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html
### Highlights
- Add Ascend Custom Ops framewrok. Developers now can write customs ops using AscendC. An example ops `rotary_embedding` is added. More tutorials will come soon. The Custome Ops complation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
- Add Ascend Custom Ops framewrok. Developers now can write customs ops using AscendC. An example ops `rotary_embedding` is added. More tutorials will come soon. The Custom Ops compilation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
- V1 engine is basic supported in this release. The full support will be done in 0.8.X release. If you hit any issue or have any requirement of V1 engine. Please tell us [here](https://github.com/vllm-project/vllm-ascend/issues/414). [#376](https://github.com/vllm-project/vllm-ascend/pull/376)
- Prefix cache feature works now. You can set `enable_prefix_caching=True` to enable it. [#282](https://github.com/vllm-project/vllm-ascend/pull/282)

View File

@@ -144,7 +144,7 @@ CODESPELL_EXCLUDES=(
)
CODESPELL_IGNORE_WORDS=(
'-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend'
'-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue'
)
# check spelling of specified files

View File

@@ -7,9 +7,9 @@ tar -zxvf pytorch_v2.5.1_py310.tar.gz
if [ "$(uname -i)" == "aarch64" ]
then
pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
python3 -m pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
else
pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --extra-index https://download.pytorch.org/whl/cpu/
fi
cd ..

View File

@@ -4,12 +4,13 @@ requires = [
"cmake>=3.26",
"decorator",
"numpy<2.0.0",
"pip",
"pybind11",
"pyyaml",
"scipy",
"setuptools>=64",
"setuptools-scm>=8",
"torch_npu >= 2.5.1rc1",
"torch_npu",
"torch >= 2.5.1",
"torchvision<0.21.0",
]

View File

@@ -1,8 +1,5 @@
[pytest]
minversion = 6.0
markers =
singlenpu: tests that run on single npu
multinpu: tests that run on multi npu
norecursedirs =
vllm-empty/tests/prefix_caching
vllm-empty/tests/weight_loading

View File

@@ -7,6 +7,6 @@ pyyaml
scipy
setuptools>=64
setuptools-scm>=8
torch_npu >= 2.5.1rc1
torch_npu
torch >= 2.5.1
torchvision<0.21.0

View File

@@ -153,23 +153,6 @@ class cmake_build_ext(build_ext):
# else specify pybind11 path installed from source code on CI container
raise RuntimeError(f"CMake configuration failed: {e}")
# try retrive soc version from npu-smi
soc_command = [
"bash",
"-c",
"npu-smi info | grep OK | awk '{print $3}' | head -n 1",
]
try:
soc_version = subprocess.check_output(soc_command,
text=True).strip()
soc_version = soc_version.split("-")[0]
soc_version = "Ascend" + soc_version
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Retrive Soc version failed: {e}")
# add SOC_VERSION
cmake_args += [f"-DSOC_VERSION={soc_version}"]
install_path = os.path.join(ROOT_DIR, self.build_lib)
if isinstance(self.distribution.get_command_obj("develop"), develop):
install_path = os.path.join(ROOT_DIR, "vllm_ascend")
@@ -178,6 +161,8 @@ class cmake_build_ext(build_ext):
cmake_args += [f"-DCMAKE_PREFIX_PATH={pybind11_cmake_path}"]
cmake_args += [f"-DSOC_VERSION={envs.SOC_VERSION}"]
# Override the base directory for FetchContent downloads to $ROOT/.deps
# This allows sharing dependencies between profiles,
# and plays more nicely with sccache.
@@ -186,6 +171,17 @@ class cmake_build_ext(build_ext):
fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]
torch_npu_command = "python3 -m pip show torch-npu | grep '^Location:' | awk '{print $2}'"
try:
torch_npu_path = subprocess.check_output(
torch_npu_command, shell=True).decode().strip()
torch_npu_path += "/torch_npu"
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Retrieve torch version version failed: {e}")
# add TORCH_NPU_PATH
cmake_args += [f"-DTORCH_NPU_PATH={torch_npu_path}"]
build_tool = []
# TODO(ganyi): ninja and ccache support for ascend c auto codegen. now we can only use make build
# if which('ninja') is not None:
@@ -205,7 +201,7 @@ class cmake_build_ext(build_ext):
)
def build_extensions(self) -> None:
if envs.COMPILE_CUSTOM_KERNELS is None:
if not envs.COMPILE_CUSTOM_KERNELS:
return
# Ensure that CMake is present and working
try:
@@ -285,7 +281,7 @@ except LookupError:
VERSION = "0.0.0"
ext_modules = []
if envs.COMPILE_CUSTOM_KERNELS is not None:
if envs.COMPILE_CUSTOM_KERNELS:
ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]

View File

@@ -0,0 +1,55 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/test_offline_inference.py`.
"""
import os
import pytest
import vllm # noqa: F401
from conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
@pytest.mark.parametrize("model, distributed_executor_backend", [
("Qwen/QwQ-32B", "mp"),
])
def test_models_distributed(model: str,
distributed_executor_backend: str) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
]
dtype = "half"
max_tokens = 5
with VllmRunner(
model,
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
if __name__ == "__main__":
import pytest
pytest.main([__file__])

View File

@@ -53,28 +53,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.multinpu
@pytest.mark.parametrize("model, distributed_executor_backend", [
("Qwen/QwQ-32B", "mp"),
])
def test_models_distributed(vllm_runner, model: str,
distributed_executor_backend: str) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
]
dtype = "half"
max_tokens = 5
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
if __name__ == "__main__":
import pytest
pytest.main([__file__])

View File

@@ -20,14 +20,13 @@
#
if command -v actionlint &> /dev/null; then
# NOTE: avoid check .github/workflows/vllm_ascend_test.yaml because sel-hosted runner `npu-arm64` is unknown
actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
actionlint .github/workflows/*.yml .github/workflows/*.yaml
exit 0
elif [ -x ./actionlint ]; then
./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
./actionlint .github/workflows/*.yml .github/workflows/*.yaml
exit 0
fi
# download a binary to the current directory - v1.7.3
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
./actionlint .github/workflows/*.yml .github/workflows/*.yaml

View File

@@ -39,6 +39,3 @@ if ! [ -x "$(command -v shellcheck)" ]; then
PATH="$PATH:$(pwd)/shellcheck-${scversion}"
export PATH
fi
# TODO - fix warnings in .buildkite/run-amd-test.sh
find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"'

View File

@@ -3,14 +3,21 @@ from typing import Any, Callable, Dict
env_variables: Dict[str, Callable[[], Any]] = {
# max compile thread num
"MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
"CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"),
"MAX_JOBS":
lambda: os.getenv("MAX_JOBS", None),
"CMAKE_BUILD_TYPE":
lambda: os.getenv("CMAKE_BUILD_TYPE"),
"COMPILE_CUSTOM_KERNELS":
lambda: os.getenv("COMPILE_CUSTOM_KERNELS", None),
# If set, vllm-ascend will print verbose logs during compliation
"VERBOSE": lambda: bool(int(os.getenv('VERBOSE', '0'))),
"ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
"LD_LIBRARY_PATH": lambda: os.getenv("LD_LIBRARY_PATH", None),
lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
"SOC_VERSION":
lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
# If set, vllm-ascend will print verbose logs during compilation
"VERBOSE":
lambda: bool(int(os.getenv('VERBOSE', '0'))),
"ASCEND_HOME_PATH":
lambda: os.getenv("ASCEND_HOME_PATH", None),
"LD_LIBRARY_PATH":
lambda: os.getenv("LD_LIBRARY_PATH", None),
}