diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml
new file mode 100644
index 000000000..be924d7bb
--- /dev/null
+++ b/.github/workflows/pr-test-npu.yml
@@ -0,0 +1,64 @@
+name: PR Test (Ascend NPU)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/**"
+      - "test/**"
+      - ".github/workflows/pr-test-npu.yml"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/**"
+      - "test/**"
+      - ".github/workflows/pr-test-npu.yml"
+  workflow_dispatch:
+
+concurrency:
+  group: pr-test-npu-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-test-basic:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    runs-on: linux-arm64-npu-1
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/npu_ci_install_dependency.sh
+          # Copy the required dataset file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+
+      - name: Run test
+        timeout-minutes: 30
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-npu
+  finish:
+    if: always()
+    needs: [ unit-test-basic ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 89d4664c5..e9e9af1d0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,7 +38,7 @@ repos:
     hooks:
       - id: codespell
         additional_dependencies: ['tomli']
-        args: ['--toml', 'python/pyproject.toml']
+        args: ['--toml', 'python/pyproject.toml', '-L', 'cann']
         exclude: test/srt/test_reasoning_parser.py # Exclude the test file that is expected to fail
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v18.1.8
diff --git a/scripts/npu_ci_install_dependency.sh b/scripts/npu_ci_install_dependency.sh
new file mode 100755
index 000000000..ec3a162d5
--- /dev/null
+++ b/scripts/npu_ci_install_dependency.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -euo pipefail
+
+# Install the required dependencies in CI.
+sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+apt update -y
+apt install -y build-essential cmake python3-pip python3-dev wget net-tools zlib1g-dev lld clang software-properties-common
+
+
+pip config set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple
+python3 -m pip install --upgrade pip
+pip uninstall sgl-kernel -y || true
+
+
+### Download MemFabricV2
+MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
+MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com:443/sglang/${MF_WHL_NAME}"
+wget "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}"
+
+
+### Install vLLM
+VLLM_TAG=v0.8.5
+git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
+(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
+
+
+### Install PyTorch and PTA
+PYTORCH_VERSION=2.6.0
+TORCHVISION_VERSION=0.21.0
+PTA_VERSION=2.6.0rc1
+pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
+pip install torch_npu==$PTA_VERSION
+
+
+### Install Triton-Ascend
+TRITON_ASCEND_VERSION=3.2.0rc2
+pip install attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11
+pip install triton-ascend==$TRITON_ASCEND_VERSION
+
+
+pip install -e "python[srt_npu]"
+
+
+### Patch PyTorch's Triton import (TODO: remove once no longer needed)
+TORCH_LOCATION=$(python3 -c 'import torch; print(torch.__path__[0])')
+sed -i 's/from triton.runtime.autotuner import OutOfResources/from triton.runtime.errors import OutOfResources/' "${TORCH_LOCATION}/_inductor/runtime/triton_heuristics.py"
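A note on the final `sed` in `scripts/npu_ci_install_dependency.sh` above: the Triton build installed there ships `OutOfResources` under `triton.runtime.errors`, while the pinned torch 2.6.0 inductor code still imports it from `triton.runtime.autotuner`, so the script rewrites torch's import in place. A version-tolerant shim showing the same idea (a sketch for illustration only, not part of this PR):

```python
# Minimal sketch: resolve OutOfResources across Triton versions.
# Newer Triton exposes it in triton.runtime.errors; older releases
# exported it from triton.runtime.autotuner. The CI script instead
# patches torch's triton_heuristics.py in place with sed.
try:
    from triton.runtime.errors import OutOfResources
except ImportError:
    # Fall back to the pre-3.x location.
    from triton.runtime.autotuner import OutOfResources
```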
diff --git a/test/srt/test_ascend_attention_backend.py b/test/srt/test_ascend_attention_backend.py
index 4ca6bba8f..e406fee3c 100644
--- a/test/srt/test_ascend_attention_backend.py
+++ b/test/srt/test_ascend_attention_backend.py
@@ -20,22 +20,10 @@ from sglang.test.test_utils import (
     run_bench_offline_throughput,
 )
 
+DEFAULT_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-7B-Instruct"
+
 
 class TestAscendAttnBackend(CustomTestCase):
-    def test_latency(self):
-        output_throughput = run_bench_offline_throughput(
-            DEFAULT_MODEL_NAME_FOR_TEST,
-            [
-                "--attention-backend",
-                "ascend",
-            ],
-        )
-
-        print(f"{output_throughput=}")
-
-        if is_in_ci():
-            self.assertGreater(output_throughput, 18)
-
     def test_gsm8k(self):
         model = DEFAULT_MODEL_NAME_FOR_TEST
         base_url = DEFAULT_URL_FOR_TEST
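The body of `test_gsm8k` lies outside the hunk above; only its first lines are visible as context. Judging from the imports kept at the top of the file, it presumably launches a server with `--attention-backend ascend` and scores GSM8K against it. A hypothetical sketch of that pattern, with the helper usage and score threshold assumed from similar sglang per-commit tests rather than taken from this file:

```python
# Hypothetical sketch; not the actual test body from the diff.
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
)

DEFAULT_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-7B-Instruct"  # as pinned in the diff


class TestAscendAttnBackendSketch(CustomTestCase):
    def test_gsm8k(self):
        # Launch an sglang server on the Ascend attention backend.
        process = popen_launch_server(
            DEFAULT_MODEL_NAME_FOR_TEST,
            DEFAULT_URL_FOR_TEST,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--attention-backend", "ascend"],
        )
        try:
            # Score a slice of GSM8K through the server's API.
            args = SimpleNamespace(
                base_url=DEFAULT_URL_FOR_TEST,
                model=DEFAULT_MODEL_NAME_FOR_TEST,
                eval_name="gsm8k",
                num_examples=200,
                num_threads=32,
            )
            metrics = run_eval(args)
            self.assertGreater(metrics["score"], 0.8)  # placeholder threshold
        finally:
            kill_process_tree(process.pid)


if __name__ == "__main__":
    unittest.main()
```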