[feat] add ascend readme and docker release (#8700)
Signed-off-by: mywaaagh_admin <pkwarcraft@gmail.com> Signed-off-by: lichaoran <pkwarcraft@gmail.com> Co-authored-by: Even Zhou <even.y.zhou@outlook.com> Co-authored-by: ronnie_zheng <zl19940307@163.com>
This commit is contained in:
24
.github/workflows/pr-test-npu.yml
vendored
24
.github/workflows/pr-test-npu.yml
vendored
@@ -27,13 +27,19 @@ jobs:
|
||||
github.event.pull_request.draft == false
|
||||
runs-on: linux-arm64-npu-1
|
||||
container:
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
# speed up by using infra cache services
|
||||
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
|
||||
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
|
||||
pip config set global.index-url http://${CACHING_URL}/pypi/simple
|
||||
pip config set global.trusted-host ${CACHING_URL}
|
||||
|
||||
bash scripts/ci/npu_ci_install_dependency.sh
|
||||
# copy required file from our daily cache
|
||||
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
|
||||
@@ -56,13 +62,19 @@ jobs:
|
||||
github.event.pull_request.draft == false
|
||||
runs-on: linux-arm64-npu-2
|
||||
container:
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
# speed up by using infra cache services
|
||||
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
|
||||
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
|
||||
pip config set global.index-url http://${CACHING_URL}/pypi/simple
|
||||
pip config set global.trusted-host ${CACHING_URL}
|
||||
|
||||
bash scripts/ci/npu_ci_install_dependency.sh
|
||||
# copy required file from our daily cache
|
||||
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
|
||||
@@ -85,13 +97,19 @@ jobs:
|
||||
github.event.pull_request.draft == false
|
||||
runs-on: linux-arm64-npu-4
|
||||
container:
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
# speed up by using infra cache services
|
||||
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
|
||||
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
|
||||
pip config set global.index-url http://${CACHING_URL}/pypi/simple
|
||||
pip config set global.trusted-host ${CACHING_URL}
|
||||
|
||||
bash scripts/ci/npu_ci_install_dependency.sh
|
||||
# copy required file from our daily cache
|
||||
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
|
||||
|
||||
76
.github/workflows/release-docker-npu-nightly.yaml
vendored
Normal file
76
.github/workflows/release-docker-npu-nightly.yaml
vendored
Normal file
@@ -0,0 +1,76 @@
|
||||
name: Release Docker Images Nightly (Ascend NPU)
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- ".github/workflows/release-docker-npu-nightly.yaml"
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 * * *"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
strategy:
|
||||
matrix:
|
||||
cann_version: ["8.2.rc1"]
|
||||
device_type: ["a3"]
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Free up disk space
|
||||
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
docker-images: false
|
||||
|
||||
- name: Setup Docker buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
${{ github.repository_owner }}/sglang
|
||||
# push with schedule event
|
||||
# push with workflow_dispatch event
|
||||
tags: |
|
||||
type=ref,event=pr
|
||||
type=ref,event=branch
|
||||
type=schedule,pattern=main
|
||||
flavor: |
|
||||
latest=false
|
||||
suffix=-cann${{ matrix.cann_version }}-${{ matrix.device_type }},onlatest=true
|
||||
# Login against a Docker registry except on PR
|
||||
# https://github.com/docker/login-action
|
||||
- name: Log into docker hub
|
||||
uses: docker/login-action@v3
|
||||
if: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
# Build and push Docker image with Buildx (don't push on PR)
|
||||
# https://github.com/docker/build-push-action
|
||||
- name: Build and push Docker image
|
||||
id: build-and-push
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: docker
|
||||
file: docker/Dockerfile.npu
|
||||
# TODO: need add x86 platforms support when memfabric is ready
|
||||
platforms: linux/arm64
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
|
||||
provenance: false
|
||||
build-args: |
|
||||
CANN_VERSION=${{ matrix.cann_version }}
|
||||
DEVICE_TYPE=${{ matrix.device_type }}
|
||||
77
.github/workflows/release-docker-npu.yaml
vendored
Normal file
77
.github/workflows/release-docker-npu.yaml
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
name: Release Docker Images (Ascend NPU)
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "*" # Trigger on all tags and filtered by PEP 440 later
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- ".github/workflows/release-docker-npu.yaml"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
strategy:
|
||||
matrix:
|
||||
cann_version: ["8.2.rc1"]
|
||||
device_type: ["a3"]
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Free up disk space
|
||||
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
docker-images: false
|
||||
|
||||
# push with tag
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
${{ github.repository_owner }}/sglang
|
||||
tags: |
|
||||
type=ref,event=pr
|
||||
type=ref,event=tag,suffix=-cann${{ matrix.cann_version }}-${{ matrix.device_type }}
|
||||
flavor: |
|
||||
latest=false
|
||||
- name: Setup Docker buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
# Login against a Docker registry except on PR
|
||||
# https://github.com/docker/login-action
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
if: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Get version
|
||||
id: get_version
|
||||
run: |
|
||||
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
||||
echo "TAG=${{ github.repository_owner }}/sglang:v$version-cann${{ matrix.cann_version }}-${{ matrix.device_type }}" >> $GITHUB_OUTPUT
|
||||
kernel_tag=$(curl -s https://api.github.com/repos/sgl-project/sgl-kernel-npu/tags | jq -r '.[0].name')
|
||||
echo "KERNEL_NPU_TAG=${kernel_tag}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build and push Docker image
|
||||
id: build-and-push
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: docker
|
||||
file: docker/Dockerfile.npu
|
||||
# TODO: need add x86 platforms support when memfabric is ready
|
||||
platforms: linux/arm64
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: ${{ steps.meta.outputs.tags || steps.get_version.outputs.TAG }}
|
||||
push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
|
||||
provenance: false
|
||||
build-args: |
|
||||
SGLANG_KERNEL_NPU_TAG=${{ steps.get_version.outputs.KERNEL_NPU_TAG }}
|
||||
CANN_VERSION=${{ matrix.cann_version }}
|
||||
DEVICE_TYPE=${{ matrix.device_type }}
|
||||
81
docker/Dockerfile.npu
Normal file
81
docker/Dockerfile.npu
Normal file
@@ -0,0 +1,81 @@
|
||||
ARG CANN_VERSION=8.2.rc1
|
||||
ARG DEVICE_TYPE=a3
|
||||
ARG OS=ubuntu22.04
|
||||
ARG PYTHON_VERSION=py3.11
|
||||
|
||||
FROM quay.io/ascend/cann:$CANN_VERSION-$DEVICE_TYPE-$OS-$PYTHON_VERSION
|
||||
|
||||
# Update pip & apt sources
|
||||
ARG PIP_INDEX_URL="https://pypi.org/simple/"
|
||||
ARG APTMIRROR=""
|
||||
ARG MEMFABRIC_URL=https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl
|
||||
ARG PYTORCH_VERSION=2.6.0
|
||||
ARG TORCHVISION_VERSION=0.21.0
|
||||
ARG PTA_URL="https://gitee.com/ascend/pytorch/releases/download/v7.1.0.1-pytorch2.6.0/torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
|
||||
ARG VLLM_TAG=v0.8.5
|
||||
ARG TRITON_ASCEND_URL=https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/triton_ascend-3.2.0.dev20250729-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl
|
||||
ARG SGLANG_TAG=main
|
||||
ARG ASCEND_CANN_PATH=/usr/local/Ascend/ascend-toolkit
|
||||
ARG SGLANG_KERNEL_NPU_TAG=main
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN pip config set global.index-url $PIP_INDEX_URL
|
||||
RUN if [ -n "$APTMIRROR" ];then sed -i "s|.*.ubuntu.com|$APTMIRROR|g" /etc/apt/sources.list ;fi
|
||||
|
||||
# Install development tools and utilities
|
||||
RUN apt-get update -y && apt upgrade -y && apt-get install -y \
|
||||
build-essential \
|
||||
cmake \
|
||||
vim \
|
||||
wget \
|
||||
curl \
|
||||
net-tools \
|
||||
zlib1g-dev \
|
||||
lld \
|
||||
clang \
|
||||
locales \
|
||||
ccache \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/cache/apt/* \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& update-ca-certificates \
|
||||
&& locale-gen en_US.UTF-8
|
||||
|
||||
ENV LANG=en_US.UTF-8
|
||||
ENV LANGUAGE=en_US:en
|
||||
ENV LC_ALL=en_US.UTF-8
|
||||
|
||||
# Install dependencies
|
||||
# TODO: install from pypi released memfabric
|
||||
# TODO: install from pypi released triton-ascend
|
||||
RUN pip install $MEMFABRIC_URL --no-cache-dir \
|
||||
&& pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu --no-cache-dir \
|
||||
&& wget ${PTA_URL} && pip install "./torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl" --no-cache-dir \
|
||||
&& pip install ${TRITON_ASCEND_URL} --no-cache-dir \
|
||||
&& python3 -m pip install --no-cache-dir numpy==1.26.4 pybind11
|
||||
|
||||
# Install vLLM
|
||||
RUN git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG && \
|
||||
cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v . --no-cache-dir && \
|
||||
cd .. && rm -rf vllm
|
||||
|
||||
# Install SGLang
|
||||
RUN git clone https://github.com/sgl-project/sglang --branch $SGLANG_TAG && \
|
||||
cd ./sglang/python && pip install .[srt_npu] --no-cache-dir && \
|
||||
cd .. && rm -rf ./sglang
|
||||
|
||||
# Install Deep-ep
|
||||
RUN git clone --branch $SGLANG_KERNEL_NPU_TAG https://github.com/sgl-project/sgl-kernel-npu.git \
|
||||
&& export LD_LIBRARY_PATH=${ASCEND_CANN_PATH}/latest/runtime/lib64/stub:$LD_LIBRARY_PATH && \
|
||||
source ${ASCEND_CANN_PATH}/set_env.sh && \
|
||||
cd sgl-kernel-npu && \
|
||||
bash build.sh \
|
||||
&& pip install output/deep_ep*.whl --no-cache-dir \
|
||||
&& cd .. && rm -rf sgl-kernel-npu \
|
||||
&& cd "$(pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -s deep_ep/deep_ep_cpp*.so
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -24,6 +24,7 @@ To run DeepSeek V3/R1 models, the requirements are as follows:
|
||||
| **Quantized weights (int8)** | 16 x A100/800 |
|
||||
| | 32 x L40S |
|
||||
| | Xeon 6980P CPU |
|
||||
| | 2 x Atlas 800I A3 |
|
||||
|
||||
<style>
|
||||
.md-typeset__table {
|
||||
@@ -64,6 +65,7 @@ Detailed commands for reference:
|
||||
- [16 x A100 (int8)](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-16-a100a800-with-int8-quantization)
|
||||
- [32 x L40S (int8)](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-32-l40s-with-int8-quantization)
|
||||
- [Xeon 6980P CPU](../platforms/cpu_server.md#example-running-deepseek-r1)
|
||||
- [2 x Atlas 800I A3 (int8)](../platforms/ascend_npu.md#running-deepseek-v3)
|
||||
|
||||
### Download Weights
|
||||
If you encounter errors when starting the server, ensure the weights have finished downloading. It's recommended to download them beforehand or restart multiple times until all weights are downloaded. Please refer to [DeepSeek V3](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base#61-inference-with-deepseek-infer-demo-example-only) official guide to download the weights.
|
||||
|
||||
@@ -1,7 +1,206 @@
|
||||
# Ascend NPUs
|
||||
# SGLang on Ascend NPUs
|
||||
|
||||
## Install
|
||||
TODO
|
||||
You can install SGLang using any of the methods below. Please go through `System Settings` section to ensure the clusters are roaring at max performance. Feel free to leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) if you encounter any issues or have any problems.
|
||||
|
||||
## System Settings
|
||||
|
||||
### CPU performance power scheme
|
||||
|
||||
The default power scheme on Ascend hardware is `ondemand` which could affect performance, changing it to `performance` is recommended.
|
||||
|
||||
```shell
|
||||
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
|
||||
|
||||
# Make sure changes are applied successfully
|
||||
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor # shows performance
|
||||
```
|
||||
|
||||
### Disable NUMA balancing
|
||||
|
||||
```shell
|
||||
sudo sysctl -w kernel.numa_balancing=0
|
||||
|
||||
# Check
|
||||
cat /proc/sys/kernel/numa_balancing # shows 0
|
||||
```
|
||||
|
||||
### Prevent swapping out system memory
|
||||
|
||||
```shell
|
||||
sudo sysctl -w vm.swappiness=10
|
||||
|
||||
# Check
|
||||
cat /proc/sys/vm/swappiness # shows 10
|
||||
```
|
||||
|
||||
## Installing SGLang
|
||||
|
||||
### Method 1: Installing from source with prerequisites
|
||||
|
||||
#### Python Version
|
||||
|
||||
Only `python==3.11` is supported currently. If you don't want to break system pre-installed python, try installing with [conda](https://github.com/conda/conda).
|
||||
|
||||
```shell
|
||||
conda create --name sglang_npu python=3.11
|
||||
conda activate sglang_npu
|
||||
```
|
||||
|
||||
#### MemFabric Adaptor
|
||||
|
||||
_TODO: MemFabric is still a work in progress and will not be open sourced until August/September 2025. For now we release it as a prebuilt wheel package._
|
||||
|
||||
_Notice: Prebuilt wheel package is based on `aarch64`, please leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) to let us know the requests for `amd64` build._
|
||||
|
||||
MemFabric Adaptor is a drop-in replacement of Mooncake Transfer Engine that enables KV cache transfer on Ascend NPU clusters.
|
||||
|
||||
```shell
|
||||
MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
|
||||
MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
|
||||
wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}"
|
||||
```
|
||||
|
||||
#### Pytorch and Pytorch Framework Adaptor on Ascend
|
||||
|
||||
Only `torch==2.6.0` is supported currently due to NPUgraph and Triton-on-Ascend's limitations; however, a more generalized version will be released by the end of September 2025.
|
||||
|
||||
```shell
|
||||
PYTORCH_VERSION=2.6.0
|
||||
TORCHVISION_VERSION=0.21.0
|
||||
pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
|
||||
|
||||
PTA_VERSION="v7.1.0.1-pytorch2.6.0"
|
||||
PTA_NAME="torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
|
||||
PTA_URL="https://gitee.com/ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}"
|
||||
wget -O "${PTA_NAME}" "${PTA_URL}" && pip install "./${PTA_NAME}"
|
||||
```
|
||||
|
||||
#### vLLM
|
||||
|
||||
vLLM is still a major prerequisite on Ascend NPU. Because of `torch==2.6.0` limitation, only vLLM v0.8.5 is supported.
|
||||
|
||||
```shell
|
||||
VLLM_TAG=v0.8.5
|
||||
git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
|
||||
(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
|
||||
```
|
||||
|
||||
#### Triton on Ascend
|
||||
|
||||
_Notice:_ We recommend installing triton-ascend from source due to its rapid development; the version on PyPI can't keep up for now. This problem will be solved in Sep. 2025, after which `pip install` will be the one and only installation method.
|
||||
|
||||
Please follow Triton-on-Ascend's [installation guide from source](https://gitee.com/ascend/triton-ascend#2%E6%BA%90%E4%BB%A3%E7%A0%81%E5%AE%89%E8%A3%85-triton-ascend) to install the latest `triton-ascend` package.
|
||||
|
||||
#### DeepEP-compatible Library
|
||||
|
||||
We are also providing a DeepEP-compatible Library as a drop-in replacement of deepseek-ai's DeepEP library, check the [installation guide](https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README.md).
|
||||
|
||||
#### Installing SGLang from source
|
||||
|
||||
```shell
|
||||
# Use the last release branch
|
||||
git clone -b v0.5.0rc0 https://github.com/sgl-project/sglang.git
|
||||
cd sglang
|
||||
|
||||
pip install --upgrade pip
|
||||
pip install -e python[srt_npu]
|
||||
```
|
||||
|
||||
### Method 2: Using docker
|
||||
|
||||
__Notice:__ `--privileged` and `--network=host` are required by RDMA, which is typically needed by Ascend NPU clusters.
|
||||
|
||||
__Notice:__ The following docker command is based on Atlas 800I A3 machines. If you are using Atlas 800I A2, make sure only `davinci[0-7]` are mapped into container.
|
||||
|
||||
```shell
|
||||
# Clone the SGLang repository
|
||||
git clone https://github.com/sgl-project/sglang.git
|
||||
cd sglang/docker
|
||||
|
||||
# Build the docker image
|
||||
docker build -t sglang-npu:main -f Dockerfile.npu .
|
||||
|
||||
alias drun='docker run -it --rm --privileged --network=host --ipc=host --shm-size=16g \
|
||||
--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 \
|
||||
--device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 \
|
||||
--device=/dev/davinci8 --device=/dev/davinci9 --device=/dev/davinci10 --device=/dev/davinci11 \
|
||||
--device=/dev/davinci12 --device=/dev/davinci13 --device=/dev/davinci14 --device=/dev/davinci15 \
|
||||
--device=/dev/davinci_manager --device=/dev/hisi_hdc \
|
||||
--volume /usr/local/sbin:/usr/local/sbin --volume /usr/local/Ascend/driver:/usr/local/Ascend/driver \
|
||||
--volume /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \
|
||||
--volume /etc/ascend_install.info:/etc/ascend_install.info \
|
||||
--volume /var/queue_schedule:/var/queue_schedule --volume ~/.cache/:/root/.cache/'
|
||||
|
||||
drun --env "HF_TOKEN=<secret>" \
|
||||
sglang-npu:main \
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --attention-backend ascend --host 0.0.0.0 --port 30000
|
||||
```
|
||||
|
||||
## Examples
|
||||
TODO
|
||||
|
||||
### Running DeepSeek-V3
|
||||
|
||||
Running DeepSeek with PD disaggregation on 2 x Atlas 800I A3.
|
||||
Model weights could be found [here](https://modelers.cn/models/State_Cloud/Deepseek-R1-bf16-hfd-w8a8).
|
||||
|
||||
Prefill:
|
||||
|
||||
```shell
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"
|
||||
|
||||
drun sglang-npu:main \
|
||||
python3 -m sglang.launch_server --model-path State_Cloud/DeepSeek-R1-bf16-hfd-w8a8 \
|
||||
--trust-remote-code \
|
||||
--attention-backend ascend \
|
||||
--mem-fraction-static 0.8 \
|
||||
--quantization w8a8_int8 \
|
||||
--tp-size 16 \
|
||||
--dp-size 1 \
|
||||
--nnodes 1 \
|
||||
--node-rank 0 \
|
||||
--disaggregation-mode prefill \
|
||||
--disaggregation-bootstrap-port 6657 \
|
||||
--disaggregation-transfer-backend ascend \
|
||||
--dist-init-addr <PREFILL_HOST_IP>:6688 \
|
||||
--host <PREFILL_HOST_IP> \
|
||||
--port 8000
|
||||
```
|
||||
|
||||
Decode:
|
||||
|
||||
```shell
|
||||
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
|
||||
export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"
|
||||
export HCCL_BUFFSIZE=200
|
||||
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=24
|
||||
|
||||
drun sglang-npu:main \
|
||||
python3 -m sglang.launch_server --model-path State_Cloud/DeepSeek-R1-bf16-hfd-w8a8 \
|
||||
--trust-remote-code \
|
||||
--attention-backend ascend \
|
||||
--mem-fraction-static 0.8 \
|
||||
--quantization w8a8_int8 \
|
||||
--enable-deepep-moe \
|
||||
--deepep-mode low_latency \
|
||||
--tp-size 16 \
|
||||
--dp-size 1 \
|
||||
--ep-size 16 \
|
||||
--nnodes 1 \
|
||||
--node-rank 0 \
|
||||
--disaggregation-mode decode \
|
||||
--disaggregation-transfer-backend ascend \
|
||||
--dist-init-addr <DECODE_HOST_IP>:6688 \
|
||||
--host <DECODE_HOST_IP> \
|
||||
--port 8001
|
||||
```
|
||||
|
||||
Mini_LB:
|
||||
|
||||
```shell
|
||||
drun sglang-npu:main \
|
||||
python -m sglang.srt.disaggregation.launch_lb \
|
||||
--prefill http://<PREFILL_HOST_IP>:8000 \
|
||||
--decode http://<DECODE_HOST_IP>:8001 \
|
||||
--host 127.0.0.1 --port 5000
|
||||
```
|
||||
|
||||
@@ -1,16 +1,9 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
|
||||
PIP_INSTALL="pip install --no-cache-dir"
|
||||
|
||||
|
||||
# Update apt & pip sources
|
||||
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
|
||||
pip config set global.index-url http://${CACHING_URL}/pypi/simple
|
||||
pip config set global.trusted-host ${CACHING_URL}
|
||||
|
||||
|
||||
# Install the required dependencies in CI.
|
||||
apt update -y && apt install -y \
|
||||
build-essential \
|
||||
@@ -31,7 +24,7 @@ python3 -m ${PIP_INSTALL} --upgrade pip
|
||||
### Download MemFabricV2
|
||||
MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
|
||||
MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
|
||||
wget "${MEMFABRIC_URL}" && ${PIP_INSTALL} "./${MF_WHL_NAME}"
|
||||
wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && ${PIP_INSTALL} "./${MF_WHL_NAME}"
|
||||
|
||||
|
||||
### Install vLLM
|
||||
@@ -43,16 +36,19 @@ git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
|
||||
### Install PyTorch and PTA
|
||||
PYTORCH_VERSION=2.6.0
|
||||
TORCHVISION_VERSION=0.21.0
|
||||
PTA_VERSION=2.6.0
|
||||
${PIP_INSTALL} torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
|
||||
${PIP_INSTALL} torch_npu==$PTA_VERSION
|
||||
|
||||
PTA_VERSION="v7.1.0.1-pytorch2.6.0"
|
||||
PTA_NAME="torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
|
||||
PTA_URL="https://gitee.com/ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}"
|
||||
wget -O "${PTA_NAME}" "${PTA_URL}" && ${PIP_INSTALL} "./${PTA_NAME}"
|
||||
|
||||
|
||||
### Install Triton-Ascend
|
||||
TRITON_ASCEND_NAME="triton_ascend-3.2.0.dev20250729-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
|
||||
TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${TRITON_ASCEND_NAME}"
|
||||
${PIP_INSTALL} attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11
|
||||
wget "${TRITON_ASCEND_URL}" && ${PIP_INSTALL} "./${TRITON_ASCEND_NAME}"
|
||||
wget -O "${TRITON_ASCEND_NAME}" "${TRITON_ASCEND_URL}" && ${PIP_INSTALL} "./${TRITON_ASCEND_NAME}"
|
||||
|
||||
|
||||
### Install SGLang
|
||||
|
||||
Reference in New Issue
Block a user