From 9c7428b3d5b63939c15ae713edc3871e51b98cbc Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Sat, 12 Apr 2025 10:24:53 +0800
Subject: [PATCH] [CI] enable custom ops build (#466)

### What this PR does / why we need it?
This PR enables the custom ops build by default.

### Does this PR introduce _any_ user-facing change?
Yes. Installing vllm-ascend from source now triggers the custom ops build step.

### How was this patch tested?
By the image build and e2e CI.

---------

Signed-off-by: wangxiyuan
---
 .github/actionlint.yaml                       |   5 +
 .github/workflows/actionlint.yml              |   2 +
 .github/workflows/image_openeuler.yml         |   7 +-
 .../workflows/{image.yml => image_ubuntu.yml} |  11 +-
 .github/workflows/vllm_ascend_test.yaml       | 282 ++----------------
 CMakeLists.txt                                |   5 +-
 Dockerfile                                    |  14 +-
 Dockerfile.openEuler                          |  17 +-
 docs/source/quick_start.md                    |   4 +-
 docs/source/tutorials/multi_node.md           |   2 +-
 docs/source/user_guide/release_notes.md       |   2 +-
 format.sh                                     |   2 +-
 pta_install.sh                                |   4 +-
 pyproject.toml                                |   3 +-
 pytest.ini                                    |   3 -
 requirements.txt                              |   2 +-
 setup.py                                      |  34 +--
 .../test_offline_inference_distributed.py     |  55 ++++
 .../test_offline_inference.py                 |  22 --
 tools/actionlint.sh                           |   7 +-
 tools/shellcheck.sh                           |   3 -
 vllm_ascend/envs.py                           |  21 +-
 22 files changed, 165 insertions(+), 342 deletions(-)
 create mode 100644 .github/actionlint.yaml
 rename .github/workflows/{image.yml => image_ubuntu.yml} (90%)
 create mode 100644 tests/multicard/test_offline_inference_distributed.py
 rename tests/{ => singlecard}/test_offline_inference.py (68%)

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
new file mode 100644
index 0000000..972abb3
--- /dev/null
+++ b/.github/actionlint.yaml
@@ -0,0 +1,5 @@
+self-hosted-runner:
+  # Labels of self-hosted runners, as an array of strings.
+  labels:
+    - linux-arm64-npu-1
+    - linux-arm64-npu-4
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index 98b2146..294b814 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -46,6 +46,8 @@ jobs:
         fetch-depth: 0

     - name: "Run actionlint"
+      env:
+        SHELLCHECK_OPTS: --exclude=SC2046,SC2006
       run: |
         echo "::add-matcher::.github/workflows/matchers/actionlint.json"
         tools/actionlint.sh -color
diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml
index ed2baf3..3e074d1 100644
--- a/.github/workflows/image_openeuler.yml
+++ b/.github/workflows/image_openeuler.yml
@@ -72,9 +72,6 @@ jobs:

     - name: Build - Set up QEMU
       uses: docker/setup-qemu-action@v3
-      # TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
-      with:
-        image: tonistiigi/binfmt:qemu-v7.0.0-28

     - name: Build - Set up Docker Buildx
       uses: docker/setup-buildx-action@v3
@@ -98,3 +95,7 @@ jobs:
         labels: ${{ steps.meta.outputs.labels }}
         tags: ${{ steps.meta.outputs.tags }}
         file: Dockerfile.openEuler
+        # TODO: support and enable custom ops build for openEuler
+        build-args: |
+          PIP_INDEX_URL=https://pypi.org/simple
+          COMPILE_CUSTOM_KERNELS=0
diff --git a/.github/workflows/image.yml b/.github/workflows/image_ubuntu.yml
similarity index 90%
rename from .github/workflows/image.yml
rename to .github/workflows/image_ubuntu.yml
index 100471a..03fc316 100644
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image_ubuntu.yml
@@ -16,7 +16,7 @@ on:
       - 'main'
       - '*-dev'
     paths:
-      - '.github/workflows/image.yml'
+      - '.github/workflows/image_ubuntu.yml'
       - 'Dockerfile'
       - 'vllm_ascend/**'
   push:
@@ -27,13 +27,13 @@ on:
     tags:
      - 'v*'
     paths:
-      - '.github/workflows/image.yml'
+      - '.github/workflows/image_ubuntu.yml'
       - 'Dockerfile'
       - 'vllm_ascend/**'

 jobs:
   build:
-    name: vllm-ascend image
+    name: vllm-ascend Ubuntu image
     runs-on: ubuntu-latest

     steps:
@@ -72,9 +72,6 @@ jobs:

     - name: Build - Set up QEMU
       uses: docker/setup-qemu-action@v3
-      # TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
-      with:
-        image: tonistiigi/binfmt:qemu-v7.0.0-28

     - name: Build - Set up Docker Buildx
       uses: docker/setup-buildx-action@v3
@@ -98,4 +95,4 @@ jobs:
         labels: ${{ steps.meta.outputs.labels }}
         tags: ${{ steps.meta.outputs.tags }}
         build-args: |
-          PIP_INDEX_URL=https://pypi.org/simple
+          PIP_INDEX_URL=https://pypi.org/simple
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index e69eb82..3042c47 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -41,9 +41,14 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  test-singlenpu:
-    name: vLLM Ascend test main(single-npu)
-    runs-on: linux-arm64-npu-1  # actionlint-ignore: runner-label
+  test:
+    strategy:
+      max-parallel: 2
+      matrix:
+        os: [linux-arm64-npu-1, linux-arm64-npu-4]
+        vllm_version: [main, v0.8.3]
+    name: vLLM Ascend test
+    runs-on: ${{ matrix.os }}
     container:
       image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
     steps:
@@ -72,6 +77,7 @@
       uses: actions/checkout@v4
       with:
         repository: vllm-project/vllm
+        ref: ${{ matrix.vllm_version }}
         path: ./vllm-empty

     - name: Install vllm-project/vllm from source
@@ -79,11 +85,6 @@
       run: |
         VLLM_TARGET_DEVICE=empty pip install -e .

-    - name: Install vllm-project/vllm-ascend
-      run: |
-        pip install -r requirements-dev.txt
-        pip install -e .
-
     - name: Install pta
       run: |
         if [ ! -d /root/.cache/pta ]; then
           mkdir -p /root/.cache/pta
         fi

         if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
           cd /root/.cache/pta
           rm -rf pytorch_v2.5.1_py310*
           wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
           tar -zxvf pytorch_v2.5.1_py310.tar.gz
         fi

         pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

+    - name: Install vllm-project/vllm-ascend
+      run: |
+        pip install -r requirements-dev.txt
+        pip install -e .
+
     - name: Run vllm-project/vllm-ascend test on V0 engine
       env:
         VLLM_USE_V1: 0
         HF_ENDPOINT: https://hf-mirror.com
       run: |
-        VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
+        if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+          pytest -sv tests/singlecard
+          pytest -sv tests/ops
+        else
+          pytest -sv tests/multicard
+          pytest -sv tests/ops
+        fi

     - name: Run vllm-project/vllm-ascend test for V1 Engine
       env:
         VLLM_USE_V1: 1
         VLLM_WORKER_MULTIPROC_METHOD: spawn
         HF_ENDPOINT: https://hf-mirror.com
       run: |
-        pytest -sv -m 'not multinpu' tests
+        if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+          pytest -sv tests/singlecard
+          pytest -sv tests/ops
+        else
+          pytest -sv tests/multicard
+          pytest -sv tests/ops
+        fi

     - name: Run vllm-project/vllm test for V0 Engine
       env:
         VLLM_USE_V1: 0
         PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
         HF_ENDPOINT: https://hf-mirror.com
       run: |
         pytest -sv
-
-  test-multinpu:
-    name: vLLM Ascend test main(multi-npu)
-    runs-on: linux-arm64-npu-4
-    container:
-      image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
-      env:
-        HF_ENDPOINT: https://hf-mirror.com
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-
-      - name: Install system dependencies
-        run: |
-          apt-get update -y
-          apt-get -y install git wget
-
-      - name: Config git
-        run: |
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -e .
-
-      - name: Install pta
-        run: |
-          if [ ! -d /root/.cache/pta ]; then
-            mkdir -p /root/.cache/pta
-          fi
-
-          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
-            cd /root/.cache/pta
-            rm -rf pytorch_v2.5.1_py310*
-            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-            tar -zxvf pytorch_v2.5.1_py310.tar.gz
-          fi
-
-          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        env:
-          VLLM_USE_V1: 0
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
-
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          pytest -sv -m 'multinpu' tests
-
-  test-singlenpu-v0_8_3:
-    name: vLLM Ascend test v0.8.3(single-npu)
-    runs-on: linux-arm64-npu-1  # actionlint-ignore: runner-label
-    container:
-      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-          apt-get update -y
-          apt install git -y
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install system dependencies
-        run: |
-          apt-get -y install `cat packages.txt`
-          apt-get -y install gcc g++ cmake libnuma-dev
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: v0.8.3
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -e .
-
-      - name: Install pta
-        run: |
-          if [ ! -d /root/.cache/pta ]; then
-            mkdir -p /root/.cache/pta
-          fi
-
-          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
-            cd /root/.cache/pta
-            rm -rf pytorch_v2.5.1_py310*
-            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-            tar -zxvf pytorch_v2.5.1_py310.tar.gz
-          fi
-
-          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        env:
-          VLLM_USE_V1: 0
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
-
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          pytest -sv -m 'not multinpu' tests
-
-      - name: Run vllm-project/vllm test for V0 Engine
-        env:
-          VLLM_USE_V1: 0
-          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          pytest -sv
-
-  test-multinpu-v0_8_3:
-    name: vLLM Ascend test v0.8.3(multi-npu)
-    runs-on: linux-arm64-npu-4
-    needs: test-multinpu
-    container:
-      image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
-      env:
-        HF_ENDPOINT: https://hf-mirror.com
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-
-      - name: Install system dependencies
-        run: |
-          apt-get update -y
-          apt-get -y install git wget
-
-      - name: Config git
-        run: |
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: v0.8.3
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -e .
-
-      - name: Install pta
-        run: |
-          if [ ! -d /root/.cache/pta ]; then
-            mkdir -p /root/.cache/pta
-          fi
-
-          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
-            cd /root/.cache/pta
-            rm -rf pytorch_v2.5.1_py310*
-            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-            tar -zxvf pytorch_v2.5.1_py310.tar.gz
-          fi
-
-          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        env:
-          VLLM_USE_V1: 0
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
-
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          HF_ENDPOINT: https://hf-mirror.com
-        run: |
-          pytest -sv -m 'multinpu' tests
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1814e4c..682b934 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
 find_package(Torch REQUIRED)

 set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+set(SOC_VERSION ${SOC_VERSION})
 message(STATUS "Detected SOC version: ${SOC_VERSION}")

 if (NOT CMAKE_BUILD_TYPE)
@@ -49,10 +50,6 @@ ascendc_library(vllm_ascend_kernels SHARED
   ${KERNEL_FILES}
 )

-execute_process(COMMAND python3 -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))"
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE TORCH_NPU_PATH
-)
 message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")

 file(GLOB VLLM_ASCEND_SRC
diff --git a/Dockerfile b/Dockerfile
index 2a0f93d..ad4d51e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,12 +18,14 @@
 FROM quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+ARG COMPILE_CUSTOM_KERNELS=1

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
+ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

 RUN apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools && \
+    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
     rm -rf /var/cache/apt/* && \
     rm -rf /var/lib/apt/lists/*

@@ -41,12 +43,16 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i

 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN python3 -m pip uninstall -y triton

-# Install vllm-ascend
-RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
 # Install torch-npu
 RUN bash /workspace/vllm-ascend/pta_install.sh

+# Install vllm-ascend
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
+    export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
+    python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope ray
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 1b3bfa3..967b666 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -17,11 +17,18 @@

 FROM quay.io/ascend/cann:8.0.0-910b-openeuler22.03-py3.10

+ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+ARG COMPILE_CUSTOM_KERNELS=1
+
+ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
+
 RUN yum update -y && \
     yum install -y python3-pip git vim wget net-tools && \
     rm -rf /var/cache/yum &&\
     rm -rf /tmp/*

+RUN pip config set global.index-url ${PIP_INDEX_URL}
+
 WORKDIR /workspace

 COPY . /workspace/vllm-ascend/
@@ -35,12 +42,16 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i

 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN python3 -m pip uninstall -y triton

-# Install vllm-ascend
-RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
 # Install torch-npu
 RUN bash /workspace/vllm-ascend/pta_install.sh

+# Install vllm-ascend
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
+    export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
+    python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope ray
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 265d68e..76edd38 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -9,7 +9,7 @@
 ## Setup environment using container

 :::::{tab-set}
-::::{tab-item} Ubuntu OS
+::::{tab-item} Ubuntu

 ```{code-block} bash
    :substitutions:
@@ -35,7 +35,7 @@ docker run --rm \
 ```
 ::::

-::::{tab-item} openEuler OS
+::::{tab-item} openEuler

 ```{code-block} bash
    :substitutions:
diff --git a/docs/source/tutorials/multi_node.md b/docs/source/tutorials/multi_node.md
index fa367d7..35c8b38 100644
--- a/docs/source/tutorials/multi_node.md
+++ b/docs/source/tutorials/multi_node.md
@@ -166,7 +166,7 @@ python -m vllm.entrypoints.openai.api_server \
 ```

 :::{note}
-If you're running DeepSeek V3/R1, please remove `quantization_config` section in `config.json` file since it's not supported by vllm-ascend currentlly.
+If you're running DeepSeek V3/R1, please remove the `quantization_config` section in the `config.json` file since it's not supported by vllm-ascend currently.
 :::

 Once your server is started, you can query the model with input prompts:
diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md
index fa04741..5504820 100644
--- a/docs/source/user_guide/release_notes.md
+++ b/docs/source/user_guide/release_notes.md
@@ -7,7 +7,7 @@ This is 2nd release candidate of v0.7.3 for vllm-ascend. Please follow the [offi
 - Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html

 ### Highlights
-- Add Ascend Custom Ops framewrok. Developers now can write customs ops using AscendC. An example ops `rotary_embedding` is added. More tutorials will come soon. The Custome Ops complation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
+- Add Ascend Custom Ops framework. Developers can now write custom ops using AscendC. An example op, `rotary_embedding`, is added. More tutorials will come soon. The Custom Ops compilation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
 - V1 engine is basic supported in this release. The full support will be done in 0.8.X release. If you hit any issue or have any requirement of V1 engine. Please tell us [here](https://github.com/vllm-project/vllm-ascend/issues/414). [#376](https://github.com/vllm-project/vllm-ascend/pull/376)
 - Prefix cache feature works now. You can set `enable_prefix_caching=True` to enable it. [#282](https://github.com/vllm-project/vllm-ascend/pull/282)
diff --git a/format.sh b/format.sh
index 1d0b940..32202aa 100755
--- a/format.sh
+++ b/format.sh
@@ -144,7 +144,7 @@ CODESPELL_EXCLUDES=(
 )

 CODESPELL_IGNORE_WORDS=(
-    '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend'
+    '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue'
 )

 # check spelling of specified files
diff --git a/pta_install.sh b/pta_install.sh
index 64c1b01..d72512c 100755
--- a/pta_install.sh
+++ b/pta_install.sh
@@ -7,9 +7,9 @@ tar -zxvf pytorch_v2.5.1_py310.tar.gz

 if [ "$(uname -i)" == "aarch64" ]
 then
-    pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+    python3 -m pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
 else
-    pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --extra-index https://download.pytorch.org/whl/cpu/
+    python3 -m pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --extra-index https://download.pytorch.org/whl/cpu/
 fi

 cd ..
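The pta_install.sh change above keeps the script's behavior, dispatching on `uname -i` and installing the matching torch_npu wheel, while routing the install through `python3 -m pip`. For illustration, a minimal Python sketch of the same dispatch, assuming only the two wheel filenames hard-coded in the script; `install_torch_npu` is a hypothetical helper, not part of the repo, and `--extra-index-url` is the long spelling of the `--extra-index` flag the script passes:

    import platform
    import subprocess
    import sys

    # Wheel filenames as hard-coded in pta_install.sh.
    WHEELS = {
        "aarch64": "torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
        "x86_64": "torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
    }

    def install_torch_npu() -> None:
        # Mirror the script's `uname -i` architecture dispatch.
        arch = platform.machine()
        if arch not in WHEELS:
            sys.exit(f"unsupported architecture: {arch}")
        cmd = [sys.executable, "-m", "pip", "install", f"./{WHEELS[arch]}"]
        if arch == "x86_64":
            # The x86_64 branch also pulls CPU torch from the PyTorch index.
            cmd += ["--extra-index-url", "https://download.pytorch.org/whl/cpu/"]
        subprocess.check_call(cmd)

    if __name__ == "__main__":
        install_torch_npu()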
diff --git a/pyproject.toml b/pyproject.toml
index c73b9b3..f8855c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,12 +4,13 @@ requires = [
     "cmake>=3.26",
     "decorator",
     "numpy<2.0.0",
+    "pip",
    "pybind11",
     "pyyaml",
     "scipy",
     "setuptools>=64",
     "setuptools-scm>=8",
-    "torch_npu >= 2.5.1rc1",
+    "torch_npu",
     "torch >= 2.5.1",
     "torchvision<0.21.0",
 ]
diff --git a/pytest.ini b/pytest.ini
index e2c9818..8889df7 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,8 +1,5 @@
 [pytest]
 minversion = 6.0
-markers =
-    singlenpu: tests that run on single npu
-    multinpu: tests that run on multi npu
 norecursedirs =
     vllm-empty/tests/prefix_caching
     vllm-empty/tests/weight_loading
diff --git a/requirements.txt b/requirements.txt
index 3f3c0b0..e20b03e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,6 @@ pyyaml
 scipy
 setuptools>=64
 setuptools-scm>=8
-torch_npu >= 2.5.1rc1
+torch_npu
 torch >= 2.5.1
 torchvision<0.21.0
diff --git a/setup.py b/setup.py
index 0bcfd88..912de3c 100644
--- a/setup.py
+++ b/setup.py
@@ -153,23 +153,6 @@ class cmake_build_ext(build_ext):
             # else specify pybind11 path installed from source code on CI container
             raise RuntimeError(f"CMake configuration failed: {e}")

-        # try retrive soc version from npu-smi
-        soc_command = [
-            "bash",
-            "-c",
-            "npu-smi info | grep OK | awk '{print $3}' | head -n 1",
-        ]
-        try:
-            soc_version = subprocess.check_output(soc_command,
-                                                  text=True).strip()
-            soc_version = soc_version.split("-")[0]
-            soc_version = "Ascend" + soc_version
-        except subprocess.CalledProcessError as e:
-            raise RuntimeError(f"Retrive Soc version failed: {e}")
-
-        # add SOC_VERSION
-        cmake_args += [f"-DSOC_VERSION={soc_version}"]
-
         install_path = os.path.join(ROOT_DIR, self.build_lib)
         if isinstance(self.distribution.get_command_obj("develop"), develop):
             install_path = os.path.join(ROOT_DIR, "vllm_ascend")
@@ -178,6 +161,8 @@ class cmake_build_ext(build_ext):

         cmake_args += [f"-DCMAKE_PREFIX_PATH={pybind11_cmake_path}"]

+        cmake_args += [f"-DSOC_VERSION={envs.SOC_VERSION}"]
+
         # Override the base directory for FetchContent downloads to $ROOT/.deps
         # This allows sharing dependencies between profiles,
         # and plays more nicely with sccache.
@@ -186,6 +171,17 @@ class cmake_build_ext(build_ext):
             fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
             cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)]

+        torch_npu_command = "python3 -m pip show torch-npu | grep '^Location:' | awk '{print $2}'"
+        try:
+            torch_npu_path = subprocess.check_output(
+                torch_npu_command, shell=True).decode().strip()
+            torch_npu_path += "/torch_npu"
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"Retrieve torch_npu path failed: {e}")
+
+        # add TORCH_NPU_PATH
+        cmake_args += [f"-DTORCH_NPU_PATH={torch_npu_path}"]
+
         build_tool = []
         # TODO(ganyi): ninja and ccache support for ascend c auto codegen. now we can only use make build
         # if which('ninja') is not None:
@@ -205,7 +201,7 @@ class cmake_build_ext(build_ext):
         )

     def build_extensions(self) -> None:
-        if envs.COMPILE_CUSTOM_KERNELS is None:
+        if not envs.COMPILE_CUSTOM_KERNELS:
             return
         # Ensure that CMake is present and working
         try:
@@ -285,7 +281,7 @@ except LookupError:
     VERSION = "0.0.0"

 ext_modules = []
-if envs.COMPILE_CUSTOM_KERNELS is not None:
+if envs.COMPILE_CUSTOM_KERNELS:
     ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")]
diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py
new file mode 100644
index 0000000..1304001
--- /dev/null
+++ b/tests/multicard/test_offline_inference_distributed.py
@@ -0,0 +1,55 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Compare the short outputs of HF and vLLM when using greedy sampling.
+
+Run `pytest tests/multicard/test_offline_inference_distributed.py`.
+"""
+import os
+
+import pytest
+import vllm  # noqa: F401
+from conftest import VllmRunner
+
+os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
+
+
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("Qwen/QwQ-32B", "mp"),
+])
+def test_models_distributed(model: str,
+                            distributed_executor_backend: str) -> None:
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    dtype = "half"
+    max_tokens = 5
+    with VllmRunner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=4,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+if __name__ == "__main__":
+    import pytest
+    pytest.main([__file__])
diff --git a/tests/test_offline_inference.py b/tests/singlecard/test_offline_inference.py
similarity index 68%
rename from tests/test_offline_inference.py
rename to tests/singlecard/test_offline_inference.py
index ecff067..3c17605 100644
--- a/tests/test_offline_inference.py
+++ b/tests/singlecard/test_offline_inference.py
@@ -53,28 +53,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.multinpu
-@pytest.mark.parametrize("model, distributed_executor_backend", [
-    ("Qwen/QwQ-32B", "mp"),
-])
-def test_models_distributed(vllm_runner, model: str,
-                            distributed_executor_backend: str) -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 if __name__ == "__main__":
     import pytest
     pytest.main([__file__])
diff --git a/tools/actionlint.sh b/tools/actionlint.sh
index d97b4bb..482d866 100755
--- a/tools/actionlint.sh
+++ b/tools/actionlint.sh
@@ -20,14 +20,13 @@
 #

 if command -v actionlint &> /dev/null; then
-  # NOTE: avoid check .github/workflows/vllm_ascend_test.yaml because sel-hosted runner `npu-arm64` is unknown
-  actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+  actionlint .github/workflows/*.yml .github/workflows/*.yaml
   exit 0
 elif [ -x ./actionlint ]; then
-  ./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+  ./actionlint .github/workflows/*.yml .github/workflows/*.yaml
   exit 0
 fi

 # download a binary to the current directory - v1.7.3
 bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
-./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+./actionlint .github/workflows/*.yml .github/workflows/*.yaml
diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh
index d782af7..1194e65 100755
--- a/tools/shellcheck.sh
+++ b/tools/shellcheck.sh
@@ -39,6 +39,3 @@ if ! [ -x "$(command -v shellcheck)" ]; then
     PATH="$PATH:$(pwd)/shellcheck-${scversion}"
     export PATH
 fi
-
-# TODO - fix warnings in .buildkite/run-amd-test.sh
-find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"'
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 014bfd7..fb88c3b 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -3,14 +3,21 @@ from typing import Any, Callable, Dict

 env_variables: Dict[str, Callable[[], Any]] = {
     # max compile thread num
-    "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
-    "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"),
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
     "COMPILE_CUSTOM_KERNELS":
-    lambda: os.getenv("COMPILE_CUSTOM_KERNELS", None),
-    # If set, vllm-ascend will print verbose logs during compliation
-    "VERBOSE": lambda: bool(int(os.getenv('VERBOSE', '0'))),
-    "ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
-    "LD_LIBRARY_PATH": lambda: os.getenv("LD_LIBRARY_PATH", None),
+    lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
+    "SOC_VERSION":
+    lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
+    # If set, vllm-ascend will print verbose logs during compilation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+    "ASCEND_HOME_PATH":
+    lambda: os.getenv("ASCEND_HOME_PATH", None),
+    "LD_LIBRARY_PATH":
+    lambda: os.getenv("LD_LIBRARY_PATH", None),
 }
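Taken together, the setup.py and envs.py changes make the custom ops build opt-out rather than opt-in: COMPILE_CUSTOM_KERNELS now defaults to 1, and the SOC version comes from the SOC_VERSION environment variable instead of being scraped from `npu-smi`. A minimal sketch of how the new defaults resolve; the names and fallbacks mirror the lambdas above:

    import os

    # Custom kernels are compiled unless COMPILE_CUSTOM_KERNELS is set to "0",
    # and the target SOC falls back to ASCEND910B1 when SOC_VERSION is unset.
    compile_custom_kernels = bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1")))
    soc_version = os.getenv("SOC_VERSION", "ASCEND910B1")

    print(compile_custom_kernels, soc_version)
    # With no overrides this prints: True ASCEND910B1
    # The openEuler image build passes COMPILE_CUSTOM_KERNELS=0, which makes the
    # first value False, so setup.py skips the vllm_ascend_C extension entirely.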