[CI] enable custom ops build (#466)

### What this PR does / why we need it?
This PR enables the custom ops build by default.

### Does this PR introduce _any_ user-facing change?

Yes, installing vllm-ascend from source will now trigger the custom ops
build step.

### How was this patch tested?
By image build and e2e CI

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-04-12 10:24:53 +08:00
committed by GitHub
parent d05ea17427
commit 9c7428b3d5
22 changed files with 165 additions and 342 deletions

5
.github/actionlint.yaml vendored Normal file
View File

@@ -0,0 +1,5 @@
self-hosted-runner:
# Labels of self-hosted runner in array of strings.
labels:
- linux-arm64-npu-1
- linux-arm64-npu-4

View File

@@ -46,6 +46,8 @@ jobs:
fetch-depth: 0
- name: "Run actionlint"
env:
SHELLCHECK_OPTS: --exclude=SC2046,SC2006
run: |
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color

View File

@@ -72,9 +72,6 @@ jobs:
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
# TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -98,3 +95,7 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
file: Dockerfile.openEuler
# TODO: support and enable custom ops build for openEuler
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
COMPILE_CUSTOM_KERNELS=0

View File

@@ -16,7 +16,7 @@ on:
- 'main'
- '*-dev'
paths:
- '.github/workflows/image.yml'
- '.github/workflows/image_ubuntu.yml'
- 'Dockerfile'
- 'vllm_ascend/**'
push:
@@ -27,13 +27,13 @@ on:
tags:
- 'v*'
paths:
- '.github/workflows/image.yml'
- '.github/workflows/image_ubuntu.yml'
- 'Dockerfile'
- 'vllm_ascend/**'
jobs:
build:
name: vllm-ascend image
name: vllm-ascend Ubuntu image
runs-on: ubuntu-latest
steps:
@@ -72,9 +72,6 @@ jobs:
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
# TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -98,4 +95,4 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
PIP_INDEX_URL=https://pypi.org/simple

View File

@@ -41,9 +41,14 @@ concurrency:
cancel-in-progress: true
jobs:
test-singlenpu:
name: vLLM Ascend test main(single-npu)
runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label
test:
strategy:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
vllm_verison: [main, v0.8.3]
name: vLLM Ascend test
runs-on: ${{ matrix.os }}
container:
image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
steps:
@@ -72,6 +77,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_verison }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
@@ -79,11 +85,6 @@ jobs:
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
@@ -99,12 +100,23 @@ jobs:
pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Run vllm-project/vllm-ascend test on V0 engine
env:
VLLM_USE_V1: 0
HF_ENDPOINT: https://hf-mirror.com
run: |
VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
pytest -sv tests/singlecard
pytest -sv tests/ops
else
pytest -sv tests/multicard
pytest -sv tests/ops
fi
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
@@ -112,7 +124,13 @@ jobs:
VLLM_WORKER_MULTIPROC_METHOD: spawn
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv -m 'not multinpu' tests
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
pytest -sv tests/singlecard
pytest -sv tests/ops
else
pytest -sv tests/multicard
pytest -sv tests/ops
fi
- name: Run vllm-project/vllm test for V0 Engine
env:
@@ -121,247 +139,3 @@ jobs:
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv
test-multinpu:
name: vLLM Ascend test main(multi-npu)
runs-on: linux-arm64-npu-4
container:
image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
env:
HF_ENDPOINT: https://hf-mirror.com
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
# sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
- name: Install system dependencies
run: |
apt-get update -y
apt-get -y install git wget
- name: Config git
run: |
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
mkdir -p /root/.cache/pta
fi
if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
cd /root/.cache/pta
rm -rf pytorch_v2.5.1_py310*
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
tar -zxvf pytorch_v2.5.1_py310.tar.gz
fi
pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Run vllm-project/vllm-ascend test on V0 engine
env:
VLLM_USE_V1: 0
HF_ENDPOINT: https://hf-mirror.com
run: |
VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv -m 'multinpu' tests
test-singlenpu-v0_8_3:
name: vLLM Ascend test v0.8.3(single-npu)
runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label
container:
image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: v0.8.3
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
mkdir -p /root/.cache/pta
fi
if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
cd /root/.cache/pta
rm -rf pytorch_v2.5.1_py310*
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
tar -zxvf pytorch_v2.5.1_py310.tar.gz
fi
pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Run vllm-project/vllm-ascend test on V0 engine
env:
VLLM_USE_V1: 0
HF_ENDPOINT: https://hf-mirror.com
run: |
VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv -m 'not multinpu' tests
- name: Run vllm-project/vllm test for V0 Engine
env:
VLLM_USE_V1: 0
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv
test-multinpu-v0_8_3:
name: vLLM Ascend test v0.8.3(multi-npu)
runs-on: linux-arm64-npu-4
needs: test-multinpu
container:
image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
env:
HF_ENDPOINT: https://hf-mirror.com
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
# sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
- name: Install system dependencies
run: |
apt-get update -y
apt-get -y install git wget
- name: Config git
run: |
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: v0.8.3
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
pip install -r requirements-dev.txt
pip install -e .
- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
mkdir -p /root/.cache/pta
fi
if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
cd /root/.cache/pta
rm -rf pytorch_v2.5.1_py310*
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
tar -zxvf pytorch_v2.5.1_py310.tar.gz
fi
pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- name: Run vllm-project/vllm-ascend test on V0 engine
env:
VLLM_USE_V1: 0
HF_ENDPOINT: https://hf-mirror.com
run: |
VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
HF_ENDPOINT: https://hf-mirror.com
run: |
pytest -sv -m 'multinpu' tests

View File

@@ -20,6 +20,7 @@ set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
find_package(Torch REQUIRED)
set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
set(SOC_VERSION ${SOC_VERSION})
message(STATUS "Detected SOC version: ${SOC_VERSION}")
if (NOT CMAKE_BUILD_TYPE)
@@ -49,10 +50,6 @@ ascendc_library(vllm_ascend_kernels SHARED
${KERNEL_FILES}
)
execute_process(COMMAND python3 -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))"
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE TORCH_NPU_PATH
)
message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
file(GLOB VLLM_ASCEND_SRC

View File

@@ -18,12 +18,14 @@
FROM quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
@@ -41,12 +43,16 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN python3 -m pip uninstall -y triton
# Install vllm-ascend
RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install torch-npu
RUN bash /workspace/vllm-ascend/pta_install.sh
# Install vllm-ascend
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope ray

View File

@@ -17,11 +17,18 @@
FROM quay.io/ascend/cann:8.0.0-910b-openeuler22.03-py3.10
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools && \
rm -rf /var/cache/yum &&\
rm -rf /tmp/*
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /workspace/vllm-ascend/
@@ -35,12 +42,16 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN python3 -m pip uninstall -y triton
# Install vllm-ascend
RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install torch-npu
RUN bash /workspace/vllm-ascend/pta_install.sh
# Install vllm-ascend
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope ray

View File

@@ -9,7 +9,7 @@
## Setup environment using container
:::::{tab-set}
::::{tab-item} Ubuntu OS
::::{tab-item} Ubuntu
```{code-block} bash
:substitutions:
@@ -35,7 +35,7 @@ docker run --rm \
```
::::
::::{tab-item} openEuler OS
::::{tab-item} openEuler
```{code-block} bash
:substitutions:

View File

@@ -166,7 +166,7 @@ python -m vllm.entrypoints.openai.api_server \
```
:::{note}
If you're running DeepSeek V3/R1, please remove `quantization_config` section in `config.json` file since it's not supported by vllm-ascend currentlly.
If you're running DeepSeek V3/R1, please remove `quantization_config` section in `config.json` file since it's not supported by vllm-ascend currently.
:::
Once your server is started, you can query the model with input prompts:

View File

@@ -7,7 +7,7 @@ This is 2nd release candidate of v0.7.3 for vllm-ascend. Please follow the [offi
- Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html
### Highlights
- Add Ascend Custom Ops framewrok. Developers now can write customs ops using AscendC. An example ops `rotary_embedding` is added. More tutorials will come soon. The Custome Ops complation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
- Add Ascend Custom Ops framewrok. Developers now can write customs ops using AscendC. An example ops `rotary_embedding` is added. More tutorials will come soon. The Custom Ops compilation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
- V1 engine is basic supported in this release. The full support will be done in 0.8.X release. If you hit any issue or have any requirement of V1 engine. Please tell us [here](https://github.com/vllm-project/vllm-ascend/issues/414). [#376](https://github.com/vllm-project/vllm-ascend/pull/376)
- Prefix cache feature works now. You can set `enable_prefix_caching=True` to enable it. [#282](https://github.com/vllm-project/vllm-ascend/pull/282)

View File

@@ -144,7 +144,7 @@ CODESPELL_EXCLUDES=(
)
CODESPELL_IGNORE_WORDS=(
'-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend'
'-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue'
)
# check spelling of specified files

View File

@@ -7,9 +7,9 @@ tar -zxvf pytorch_v2.5.1_py310.tar.gz
if [ "$(uname -i)" == "aarch64" ]
then
pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
python3 -m pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
else
pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --extra-index https://download.pytorch.org/whl/cpu/
fi
cd ..

View File

@@ -4,12 +4,13 @@ requires = [
"cmake>=3.26",
"decorator",
"numpy<2.0.0",
"pip",
"pybind11",
"pyyaml",
"scipy",
"setuptools>=64",
"setuptools-scm>=8",
"torch_npu >= 2.5.1rc1",
"torch_npu",
"torch >= 2.5.1",
"torchvision<0.21.0",
]

View File

@@ -1,8 +1,5 @@
[pytest]
minversion = 6.0
markers =
singlenpu: tests that run on single npu
multinpu: tests that run on multi npu
norecursedirs =
vllm-empty/tests/prefix_caching
vllm-empty/tests/weight_loading

View File

@@ -7,6 +7,6 @@ pyyaml
scipy
setuptools>=64
setuptools-scm>=8
torch_npu >= 2.5.1rc1
torch_npu
torch >= 2.5.1
torchvision<0.21.0

View File

@@ -153,23 +153,6 @@ class cmake_build_ext(build_ext):
# else specify pybind11 path installed from source code on CI container
raise RuntimeError(f"CMake configuration failed: {e}")
# try retrive soc version from npu-smi
soc_command = [
"bash",
"-c",
"npu-smi info | grep OK | awk '{print $3}' | head -n 1",
]
try:
soc_version = subprocess.check_output(soc_command,
text=True).strip()
soc_version = soc_version.split("-")[0]
soc_version = "Ascend" + soc_version
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Retrive Soc version failed: {e}")
# add SOC_VERSION
cmake_args += [f"-DSOC_VERSION={soc_version}"]
install_path = os.path.join(ROOT_DIR, self.build_lib)
if isinstance(self.distribution.get_command_obj("develop"), develop):
install_path = os.path.join(ROOT_DIR, "vllm_ascend")
@@ -178,6 +161,8 @@ class cmake_build_ext(build_ext):
cmake_args += [f"-DCMAKE_PREFIX_PATH={pybind11_cmake_path}"]
cmake_args += [f"-DSOC_VERSION={envs.SOC_VERSION}"]
# Override the base directory for FetchContent downloads to $ROOT/.deps
# This allows sharing dependencies between profiles,
# and plays more nicely with sccache.
@@ -186,6 +171,17 @@ class cmake_build_ext(build_ext):
fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]
torch_npu_command = "python3 -m pip show torch-npu | grep '^Location:' | awk '{print $2}'"
try:
torch_npu_path = subprocess.check_output(
torch_npu_command, shell=True).decode().strip()
torch_npu_path += "/torch_npu"
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Retrieve torch version version failed: {e}")
# add TORCH_NPU_PATH
cmake_args += [f"-DTORCH_NPU_PATH={torch_npu_path}"]
build_tool = []
# TODO(ganyi): ninja and ccache support for ascend c auto codegen. now we can only use make build
# if which('ninja') is not None:
@@ -205,7 +201,7 @@ class cmake_build_ext(build_ext):
)
def build_extensions(self) -> None:
if envs.COMPILE_CUSTOM_KERNELS is None:
if not envs.COMPILE_CUSTOM_KERNELS:
return
# Ensure that CMake is present and working
try:
@@ -285,7 +281,7 @@ except LookupError:
VERSION = "0.0.0"
ext_modules = []
if envs.COMPILE_CUSTOM_KERNELS is not None:
if envs.COMPILE_CUSTOM_KERNELS:
ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]

View File

@@ -0,0 +1,55 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/test_offline_inference.py`.
"""
import os
import pytest
import vllm # noqa: F401
from conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
@pytest.mark.parametrize("model, distributed_executor_backend", [
("Qwen/QwQ-32B", "mp"),
])
def test_models_distributed(model: str,
distributed_executor_backend: str) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
]
dtype = "half"
max_tokens = 5
with VllmRunner(
model,
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
if __name__ == "__main__":
import pytest
pytest.main([__file__])

View File

@@ -53,28 +53,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.multinpu
@pytest.mark.parametrize("model, distributed_executor_backend", [
("Qwen/QwQ-32B", "mp"),
])
def test_models_distributed(vllm_runner, model: str,
distributed_executor_backend: str) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
]
dtype = "half"
max_tokens = 5
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
if __name__ == "__main__":
import pytest
pytest.main([__file__])

View File

@@ -20,14 +20,13 @@
#
if command -v actionlint &> /dev/null; then
# NOTE: avoid check .github/workflows/vllm_ascend_test.yaml because sel-hosted runner `npu-arm64` is unknown
actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
actionlint .github/workflows/*.yml .github/workflows/*.yaml
exit 0
elif [ -x ./actionlint ]; then
./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
./actionlint .github/workflows/*.yml .github/workflows/*.yaml
exit 0
fi
# download a binary to the current directory - v1.7.3
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
./actionlint .github/workflows/*.yml .github/workflows/*.yaml

View File

@@ -39,6 +39,3 @@ if ! [ -x "$(command -v shellcheck)" ]; then
PATH="$PATH:$(pwd)/shellcheck-${scversion}"
export PATH
fi
# TODO - fix warnings in .buildkite/run-amd-test.sh
find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"'

View File

@@ -3,14 +3,21 @@ from typing import Any, Callable, Dict
env_variables: Dict[str, Callable[[], Any]] = {
# max compile thread num
"MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
"CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"),
"MAX_JOBS":
lambda: os.getenv("MAX_JOBS", None),
"CMAKE_BUILD_TYPE":
lambda: os.getenv("CMAKE_BUILD_TYPE"),
"COMPILE_CUSTOM_KERNELS":
lambda: os.getenv("COMPILE_CUSTOM_KERNELS", None),
# If set, vllm-ascend will print verbose logs during compliation
"VERBOSE": lambda: bool(int(os.getenv('VERBOSE', '0'))),
"ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
"LD_LIBRARY_PATH": lambda: os.getenv("LD_LIBRARY_PATH", None),
lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
"SOC_VERSION":
lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
# If set, vllm-ascend will print verbose logs during compilation
"VERBOSE":
lambda: bool(int(os.getenv('VERBOSE', '0'))),
"ASCEND_HOME_PATH":
lambda: os.getenv("ASCEND_HOME_PATH", None),
"LD_LIBRARY_PATH":
lambda: os.getenv("LD_LIBRARY_PATH", None),
}