[CI] Refactor to speedup image building and CI Installation (#6708)

### What this PR does / why we need it?
1. Refactor the image workflow to use cache-from to speed up builds

![build](https://github.com/user-attachments/assets/02135c12-0069-44f8-a3ec-5c2b4282448a)

At the same time, all Dockerfiles were refactored by placing layers that
rarely change before those that change frequently, improving the build
cache hit rate.

2. Refactor the E2E tests to use vllm-ascend container images, skipping
C compilation when no C code has changed

![e2e](https://github.com/user-attachments/assets/49f5b166-0df3-41e1-8f71-b3bbbed17cfd)

In this case, the job only replaces the vllm-ascend source code and
installs `requirements-dev.txt`, saving about 10 minutes before the tests run

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main:
9562912cea

Signed-off-by: wjunLu <wjunlu217@gmail.com>
This commit is contained in:
wjunLu
2026-02-28 09:06:00 +08:00
committed by GitHub
parent 5666ce03f5
commit 84b00695f8
13 changed files with 456 additions and 213 deletions

View File

@@ -17,16 +17,71 @@ on:
type: boolean
jobs:
select-image:
runs-on: linux-aarch64-a2b3-0
outputs:
image: ${{ steps.select.outputs.image }}
image_a3: ${{ steps.select.outputs.image_a3 }}
image_310p: ${{ steps.select.outputs.image_310p }}
steps:
- name: Select image based on base branch
id: select
env:
IMAGE_NAMESPACE: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend
run: |
BRANCH="${{ github.base_ref }}"
BRANCH_TAG="${BRANCH//\//-}"
MAIN_IMAGE="${{ inputs.image }}"
MAIN_IMAGE_A3="${{ inputs.image }}-a3"
MAIN_IMAGE_310P="${{ inputs.image }}-310p"
if [ "$BRANCH_TAG" = "main" ]; then
echo "Target branch is main, using main images: ${MAIN_IMAGE} / ${MAIN_IMAGE_A3} / ${MAIN_IMAGE_310P}"
echo "image=${MAIN_IMAGE}" >> $GITHUB_OUTPUT
echo "image_a3=${MAIN_IMAGE_A3}" >> $GITHUB_OUTPUT
echo "image_310p=${MAIN_IMAGE_310P}" >> $GITHUB_OUTPUT
exit 0
fi
# while target branch is not main
BRANCH_IMAGE="${IMAGE_NAMESPACE}:${BRANCH_TAG}"
BRANCH_IMAGE_A3="${IMAGE_NAMESPACE}:${BRANCH_TAG}-a3"
BRANCH_IMAGE_310P="${IMAGE_NAMESPACE}:${BRANCH_TAG}-310p"
# Check if branch-specific A2 image exists via IMAGE_NAMESPACE, fallback to main if not
if docker manifest inspect "${BRANCH_IMAGE}" > /dev/null 2>&1; then
echo "Using branch image: ${BRANCH_IMAGE}"
echo "image=${BRANCH_IMAGE}" >> $GITHUB_OUTPUT
else
echo "Branch image not found, falling back to ${MAIN_IMAGE}"
echo "image=${MAIN_IMAGE}" >> $GITHUB_OUTPUT
fi
# Check if branch-specific A3 image exists via IMAGE_NAMESPACE, fallback to main if not
if docker manifest inspect "${BRANCH_IMAGE_A3}" > /dev/null 2>&1; then
echo "Using branch A3 image: ${BRANCH_IMAGE_A3}"
echo "image_a3=${BRANCH_IMAGE_A3}" >> $GITHUB_OUTPUT
else
echo "Branch A3 image not found, falling back to ${MAIN_IMAGE_A3}"
echo "image_a3=${MAIN_IMAGE_A3}" >> $GITHUB_OUTPUT
fi
# Check if branch-specific 310P image exists via IMAGE_NAMESPACE, fallback to main if not
if docker manifest inspect "${BRANCH_IMAGE_310P}" > /dev/null 2>&1; then
echo "Using branch 310P image: ${BRANCH_IMAGE_310P}"
echo "image_310p=${BRANCH_IMAGE_310P}" >> $GITHUB_OUTPUT
else
echo "Branch 310P image not found, falling back to ${MAIN_IMAGE_310P}"
echo "image_310p=${MAIN_IMAGE_310P}" >> $GITHUB_OUTPUT
fi
e2e-light:
name: singlecard-light
if: ${{ inputs.type == 'light' }}
needs: [select-image]
runs-on: linux-aarch64-a2b3-1
strategy:
fail-fast: false
matrix:
part: [0]
container:
image: ${{ inputs.image }}
image: ${{ needs.select-image.outputs.image }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
@@ -34,6 +89,8 @@ jobs:
steps:
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Check npu and CANN info
run: |
npu-smi info
@@ -46,14 +103,11 @@ jobs:
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev clang-15
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
@@ -63,19 +117,45 @@ jobs:
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
- name: Install vllm-project/vllm
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
pip uninstall -y vllm
rm -rf /vllm-workspace/vllm
cp -r ./vllm-empty /vllm-workspace/vllm
VLLM_TARGET_DEVICE=empty pip install -v -e /vllm-workspace/vllm/
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
DEST="/vllm-workspace/vllm-ascend"
# rebase commits
git config user.email "action@github.com"
git config user.name "GitHub Action"
git fetch origin ${{ github.base_ref }}
git rebase origin/${{ github.base_ref }}
IMAGE_SHA=$(git -C "${DEST}" log -1 --format=%H 2>/dev/null || echo "")
cp -rT . "${DEST}/"
if [ -n "$IMAGE_SHA" ] && git cat-file -e "${IMAGE_SHA}" 2>/dev/null; then
C_CHANGES=$(git diff "${IMAGE_SHA}"..HEAD --name-only -- \
csrc/ cmake/ CMakeLists.txt setup.py requirements.txt requirements-dev.txt)
echo "[debug] C_CHANGES=${C_CHANGES:-<empty>}"
else
echo "[debug] IMAGE_SHA not found in local history (empty or unreachable), forcing reinstall"
C_CHANGES="yes"
fi
pip install -r ${DEST}/requirements-dev.txt
if [ -n "$C_CHANGES" ]; then
echo "[debug] C code / build changes detected, reinstalling vllm-ascend..."
pip install -v -e "${DEST}/"
else
echo "[debug] No C code / build changes detected, skipping reinstall."
fi
- name: Run vllm-project/vllm-ascend test
working-directory: /vllm-workspace/vllm-ascend
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
@@ -85,13 +165,14 @@ jobs:
e2e-full:
name: singlecard-full
if: ${{ inputs.type == 'full' }}
needs: [select-image]
runs-on: linux-aarch64-a2b3-1
strategy:
fail-fast: false
matrix:
part: [0, 1]
container:
image: ${{ inputs.image }}
image: ${{ needs.select-image.outputs.image }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
@@ -100,6 +181,8 @@ jobs:
steps:
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Check npu and CANN info
run: |
@@ -113,14 +196,11 @@ jobs:
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev clang-15
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
@@ -130,18 +210,44 @@ jobs:
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
- name: Install vllm-project/vllm
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
pip uninstall -y vllm
rm -rf /vllm-workspace/vllm
cp -r ./vllm-empty /vllm-workspace/vllm
VLLM_TARGET_DEVICE=empty pip install -v -e /vllm-workspace/vllm/
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
DEST="/vllm-workspace/vllm-ascend"
# rebase commits
git config user.email "action@github.com"
git config user.name "GitHub Action"
git fetch origin ${{ github.base_ref }}
git rebase origin/${{ github.base_ref }}
IMAGE_SHA=$(git -C "${DEST}" log -1 --format=%H 2>/dev/null || echo "")
cp -rT . "${DEST}/"
if [ -n "$IMAGE_SHA" ] && git cat-file -e "${IMAGE_SHA}" 2>/dev/null; then
C_CHANGES=$(git diff "${IMAGE_SHA}"..HEAD --name-only -- \
csrc/ cmake/ CMakeLists.txt setup.py requirements.txt requirements-dev.txt)
echo "[debug] C_CHANGES=${C_CHANGES:-<empty>}"
else
echo "[debug] IMAGE_SHA not found in local history (empty or unreachable), forcing reinstall"
C_CHANGES="yes"
fi
pip install -r ${DEST}/requirements-dev.txt
if [ -n "$C_CHANGES" ]; then
echo "[debug] C code / build changes detected, reinstalling vllm-ascend..."
pip install -v -e "${DEST}/"
else
echo "[debug] No C code / build changes detected, skipping reinstall."
fi
- name: Run e2e test
working-directory: /vllm-workspace/vllm-ascend
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
@@ -151,13 +257,14 @@ jobs:
e2e-2-cards-light:
name: multicard-2-light
if: ${{ inputs.type == 'light' }}
needs: [select-image]
runs-on: linux-aarch64-a3-2
strategy:
fail-fast: false
matrix:
part: [0]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11
image: ${{ needs.select-image.outputs.image_a3 }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
@@ -166,6 +273,8 @@ jobs:
steps:
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Check npu and CANN info
run: |
npu-smi info
@@ -178,14 +287,11 @@ jobs:
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev clang-15
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
@@ -195,18 +301,44 @@ jobs:
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
- name: Install vllm-project/vllm
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
pip uninstall -y vllm
rm -rf /vllm-workspace/vllm
cp -r ./vllm-empty /vllm-workspace/vllm
VLLM_TARGET_DEVICE=empty pip install -v -e /vllm-workspace/vllm/
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
DEST="/vllm-workspace/vllm-ascend"
# rebase commits
git config user.email "action@github.com"
git config user.name "GitHub Action"
git fetch origin ${{ github.base_ref }}
git rebase origin/${{ github.base_ref }}
IMAGE_SHA=$(git -C "${DEST}" log -1 --format=%H 2>/dev/null || echo "")
cp -rT . "${DEST}/"
if [ -n "$IMAGE_SHA" ] && git cat-file -e "${IMAGE_SHA}" 2>/dev/null; then
C_CHANGES=$(git diff "${IMAGE_SHA}"..HEAD --name-only -- \
csrc/ cmake/ CMakeLists.txt setup.py requirements.txt requirements-dev.txt)
echo "[debug] C_CHANGES=${C_CHANGES:-<empty>}"
else
echo "[debug] IMAGE_SHA not found in local history (empty or unreachable), forcing reinstall"
C_CHANGES="yes"
fi
pip install -r ${DEST}/requirements-dev.txt
if [ -n "$C_CHANGES" ]; then
echo "[debug] C code / build changes detected, reinstalling vllm-ascend..."
pip install -v -e "${DEST}/"
else
echo "[debug] No C code / build changes detected, skipping reinstall."
fi
- name: Run vllm-project/vllm-ascend test (light)
working-directory: /vllm-workspace/vllm-ascend
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
@@ -215,13 +347,14 @@ jobs:
e2e-2-cards-full:
name: multicard-2-full
if: ${{ inputs.type == 'full' }}
needs: [select-image]
runs-on: linux-aarch64-a3-2
strategy:
fail-fast: false
matrix:
part: [0]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11
image: ${{ needs.select-image.outputs.image_a3 }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
@@ -230,6 +363,8 @@ jobs:
steps:
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Check npu and CANN info
run: |
npu-smi info
@@ -242,14 +377,11 @@ jobs:
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev clang-15
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
@@ -259,18 +391,44 @@ jobs:
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
- name: Install vllm-project/vllm
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
pip uninstall -y vllm
rm -rf /vllm-workspace/vllm
cp -r ./vllm-empty /vllm-workspace/vllm
VLLM_TARGET_DEVICE=empty pip install -v -e /vllm-workspace/vllm/
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
DEST="/vllm-workspace/vllm-ascend"
# rebase commits
git config user.email "action@github.com"
git config user.name "GitHub Action"
git fetch origin ${{ github.base_ref }}
git rebase origin/${{ github.base_ref }}
IMAGE_SHA=$(git -C "${DEST}" log -1 --format=%H 2>/dev/null || echo "")
cp -rT . "${DEST}/"
if [ -n "$IMAGE_SHA" ] && git cat-file -e "${IMAGE_SHA}" 2>/dev/null; then
C_CHANGES=$(git diff "${IMAGE_SHA}"..HEAD --name-only -- \
csrc/ cmake/ CMakeLists.txt setup.py requirements.txt requirements-dev.txt)
echo "[debug] C_CHANGES=${C_CHANGES:-<empty>}"
else
echo "[debug] IMAGE_SHA not found in local history (empty or unreachable), forcing reinstall"
C_CHANGES="yes"
fi
pip install -r ${DEST}/requirements-dev.txt
if [ -n "$C_CHANGES" ]; then
echo "[debug] C code / build changes detected, reinstalling vllm-ascend..."
pip install -v -e "${DEST}/"
else
echo "[debug] No C code / build changes detected, skipping reinstall."
fi
- name: Run vllm-project/vllm-ascend test (full)
working-directory: /vllm-workspace/vllm-ascend
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
@@ -278,6 +436,7 @@ jobs:
- name: Run vllm-project/vllm-ascend test (non triton)
if: ${{ inputs.type == 'full' && matrix.part == 0 }}
working-directory: /vllm-workspace/vllm-ascend
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
@@ -287,13 +446,14 @@ jobs:
e2e-4-cards-full:
name: multicard-4-full
if: ${{ inputs.type == 'full' }}
needs: [select-image]
runs-on: linux-aarch64-a3-4
strategy:
fail-fast: false
matrix:
part: [0]
container:
image: m.daocloud.io/quay.io/ascend/cann:8.5.0-a3-ubuntu22.04-py3.11
image: ${{ needs.select-image.outputs.image_a3 }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
@@ -301,6 +461,8 @@ jobs:
steps:
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Check npu and CANN info
run: |
npu-smi info
@@ -313,14 +475,11 @@ jobs:
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev clang-15
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
@@ -330,19 +489,44 @@ jobs:
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
- name: Install vllm-project/vllm
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
pip uninstall -y vllm
rm -rf /vllm-workspace/vllm
cp -r ./vllm-empty /vllm-workspace/vllm
VLLM_TARGET_DEVICE=empty pip install -v -e /vllm-workspace/vllm/
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
DEST="/vllm-workspace/vllm-ascend"
# rebase commits
git config user.email "action@github.com"
git config user.name "GitHub Action"
git fetch origin ${{ github.base_ref }}
git rebase origin/${{ github.base_ref }}
IMAGE_SHA=$(git -C "${DEST}" log -1 --format=%H 2>/dev/null || echo "")
cp -rT . "${DEST}/"
if [ -n "$IMAGE_SHA" ] && git cat-file -e "${IMAGE_SHA}" 2>/dev/null; then
C_CHANGES=$(git diff "${IMAGE_SHA}"..HEAD --name-only -- \
csrc/ cmake/ CMakeLists.txt setup.py requirements.txt requirements-dev.txt)
echo "[debug] C_CHANGES=${C_CHANGES:-<empty>}"
else
echo "[debug] IMAGE_SHA not found in local history (empty or unreachable), forcing reinstall"
C_CHANGES="yes"
fi
pip install -r ${DEST}/requirements-dev.txt
if [ -n "$C_CHANGES" ]; then
echo "[debug] C code / build changes detected, reinstalling vllm-ascend..."
pip install -v -e "${DEST}/"
else
echo "[debug] No C code / build changes detected, skipping reinstall."
fi
- name: Run vllm-project/vllm-ascend test for V1 Engine
working-directory: /vllm-workspace/vllm-ascend
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
@@ -352,17 +536,24 @@ jobs:
name: 310p singlecard
runs-on: linux-aarch64-310p-1
if: ${{ inputs.contains_310 }}
needs: [select-image]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-310p-ubuntu22.04-py3.11
image: ${{ needs.select-image.outputs.image_310p }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
HF_HUB_OFFLINE: 1
steps:
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
@@ -370,14 +561,11 @@ jobs:
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
@@ -387,19 +575,44 @@ jobs:
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
- name: Install vllm-project/vllm
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
pip uninstall -y vllm
rm -rf /vllm-workspace/vllm
cp -r ./vllm-empty /vllm-workspace/vllm
VLLM_TARGET_DEVICE=empty pip install -v -e /vllm-workspace/vllm/
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
DEST="/vllm-workspace/vllm-ascend"
# rebase commits
git config user.email "action@github.com"
git config user.name "GitHub Action"
git fetch origin ${{ github.base_ref }}
git rebase origin/${{ github.base_ref }}
IMAGE_SHA=$(git -C "${DEST}" log -1 --format=%H 2>/dev/null || echo "")
cp -rT . "${DEST}/"
if [ -n "$IMAGE_SHA" ] && git cat-file -e "${IMAGE_SHA}" 2>/dev/null; then
C_CHANGES=$(git diff "${IMAGE_SHA}"..HEAD --name-only -- \
csrc/ cmake/ CMakeLists.txt setup.py requirements.txt requirements-dev.txt)
echo "[debug] C_CHANGES=${C_CHANGES:-<empty>}"
else
echo "[debug] IMAGE_SHA not found in local history (empty or unreachable), forcing reinstall"
C_CHANGES="yes"
fi
pip install -r ${DEST}/requirements-dev.txt
if [ -n "$C_CHANGES" ]; then
echo "[debug] C code / build changes detected, reinstalling vllm-ascend..."
pip install -v -e "${DEST}/"
else
echo "[debug] No C code / build changes detected, skipping reinstall."
fi
- name: Run vllm-project/vllm-ascend test
working-directory: /vllm-workspace/vllm-ascend
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
@@ -410,17 +623,24 @@ jobs:
name: 310p multicards 4cards
runs-on: linux-aarch64-310p-4
if: ${{ inputs.contains_310 }}
needs: [select-image]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-310p-ubuntu22.04-py3.11
image: ${{ needs.select-image.outputs.image_310p }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
HF_HUB_OFFLINE: 1
steps:
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
@@ -428,14 +648,11 @@ jobs:
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
@@ -445,19 +662,44 @@ jobs:
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
- name: Install vllm-project/vllm
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
pip uninstall -y vllm
rm -rf /vllm-workspace/vllm
cp -r ./vllm-empty /vllm-workspace/vllm
VLLM_TARGET_DEVICE=empty pip install -v -e /vllm-workspace/vllm/
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
DEST="/vllm-workspace/vllm-ascend"
# rebase commits
git config user.email "action@github.com"
git config user.name "GitHub Action"
git fetch origin ${{ github.base_ref }}
git rebase origin/${{ github.base_ref }}
IMAGE_SHA=$(git -C "${DEST}" log -1 --format=%H 2>/dev/null || echo "")
cp -rT . "${DEST}/"
if [ -n "$IMAGE_SHA" ] && git cat-file -e "${IMAGE_SHA}" 2>/dev/null; then
C_CHANGES=$(git diff "${IMAGE_SHA}"..HEAD --name-only -- \
csrc/ cmake/ CMakeLists.txt setup.py requirements.txt requirements-dev.txt)
echo "[debug] C_CHANGES=${C_CHANGES:-<empty>}"
else
echo "[debug] IMAGE_SHA not found in local history (empty or unreachable), forcing reinstall"
C_CHANGES="yes"
fi
pip install -r ${DEST}/requirements-dev.txt
if [ -n "$C_CHANGES" ]; then
echo "[debug] C code / build changes detected, reinstalling vllm-ascend..."
pip install -v -e "${DEST}/"
else
echo "[debug] No C code / build changes detected, skipping reinstall."
fi
- name: Run vllm-project/vllm-ascend test
working-directory: /vllm-workspace/vllm-ascend
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn

View File

@@ -76,6 +76,26 @@ jobs:
driver: docker-container
use: true
- name: Set cache ref
id: cache
run: |
if [ "${{ github.ref_type }}" = "tag" ]; then
# For tag events, use the images built from source branch as cache (the tag image doesn't exist yet).
if [ -z "$branch" ]; then
branch=$(git branch -r --contains HEAD \
| grep -v 'HEAD' \
| sed 's|[[:space:]]*origin/||' \
| head -1)
fi
branch="${branch:-main}"
else
# For branch push / schedule / workflow_dispatch, use the triggering branch name
branch="${{ github.ref_name }}"
fi
# Replace / with - for use in image tags
branch="${branch//\//-}"
echo "ref=quay.io/ascend/vllm-ascend:${branch}-${{ inputs.suffix }}" >> $GITHUB_OUTPUT
- name: Build and push
uses: docker/build-push-action@v6
id: build
@@ -89,6 +109,8 @@ jobs:
outputs: type=image,name=quay.io/ascend/vllm-ascend,push-by-digest=true,name-canonical=true,push=${{ inputs.should_push }}
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
# use previously pushed multi-arch image as cache to speed up builds
cache-from: type=registry,ref=${{ steps.cache.outputs.ref }}
provenance: false
- name: Export digest
@@ -154,6 +176,7 @@ jobs:
# which follow the rule from vLLM with prefix v
# TODO(yikun): the post release might be considered as latest release
tags: |
type=branch,suffix=${{ env.SUFFIX }}
type=pep440,pattern={{raw}},suffix=${{ env.SUFFIX }}
type=schedule,pattern=main,suffix=${{ env.SUFFIX }}
type=raw,value=${{ inputs.workflow_dispatch_tag }},enable=${{ github.event_name == 'workflow_dispatch' }},suffix=${{ env.SUFFIX }}

View File

@@ -81,6 +81,6 @@ jobs:
uses: ./.github/workflows/_e2e_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:main
contains_310: false
type: full

View File

@@ -107,6 +107,6 @@ jobs:
uses: ./.github/workflows/_e2e_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:main
contains_310: ${{ needs.changes.outputs._310_tracker == 'true' }}
type: light

View File

@@ -12,12 +12,15 @@
name: Image Build and Push
on:
schedule:
# UTC+8: 8am, 12pm, 16pm, 22pm
- cron: '0 0,4,8,14 * * *'
# UTC+8: 8am, 10am, 12pm, 14pm, 16pm, 18pm, 22pm
- cron: '0 0,2,4,6,8,10,14 * * *'
push:
branches:
# Build release branch images proactively so cache is warm when the tag is pushed
- 'releases/*'
tags:
- 'v*'
pull_request:
pull_request:
branches:
- 'main'
types: [ labeled, synchronize ]

View File

@@ -50,6 +50,7 @@ jobs:
--build-arg CANN_VERSION="8.5.0" \
--build-arg UBUNTU_VERSION="22.04" \
--build-arg PYTHON_VERSION="3.11" \
--cache-from "$IMAGE_TAG" \
-t "$IMAGE_TAG" .
echo "image-tag=$IMAGE_TAG" >> $GITHUB_OUTPUT

View File

@@ -34,6 +34,6 @@ jobs:
uses: ./.github/workflows/_e2e_test.yaml
with:
vllm: main
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:main
contains_310: false
type: full

View File

@@ -19,46 +19,49 @@ FROM quay.io/ascend/cann:8.5.0-910b-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG MOONCAKE_TAG="v0.3.8.post1"
ARG SOC_VERSION="ascend910b1"
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
COPY ./tools/mooncake_installer.sh /vllm-workspace/
# Install Mooncake dependencies
# Install clang-15 (for triton-ascend) and Mooncake
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 clang-15 && \
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 && \
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
mv /vllm-workspace/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install modelscope (for fast download) and ray (for multinode)
RUN pip config set global.index-url ${PIP_INDEX_URL} && \
python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm && \
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
ARG SOC_VERSION="ascend910b1"
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
COPY . /vllm-workspace/vllm-ascend/
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
@@ -66,18 +69,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install clang-15 (for triton-ascend)
RUN apt-get update -y && \
apt-get -y install clang-15 && \
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 && \
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -18,37 +18,36 @@
FROM quay.io/ascend/cann:8.5.0-310p-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG SOC_VERSION="ascend310p1"
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
WORKDIR /workspace
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install modelscope (for fast download) and ray (for multinode)
RUN pip config set global.index-url ${PIP_INDEX_URL} && \
python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm && \
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
ARG SOC_VERSION="ascend310p1"
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
COPY . /vllm-workspace/vllm-ascend/
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
@@ -56,10 +55,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -18,32 +18,34 @@
FROM quay.io/ascend/cann:8.5.0-310p-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG SOC_VERSION="ascend310p1"
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
WORKDIR /workspace
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install modelscope (for fast download) and ray (for multinode)
RUN pip config set global.index-url ${PIP_INDEX_URL} && \
python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm && \
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
ARG SOC_VERSION="ascend310p1"
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
COPY . /vllm-workspace/vllm-ascend/
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
@@ -52,10 +54,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -19,24 +19,20 @@ FROM quay.io/ascend/cann:8.5.0-a3-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG MOONCAKE_TAG=v0.3.8.post1
ARG SOC_VERSION="ascend910_9391"
COPY . /vllm-workspace/vllm-ascend/
# Define environments
COPY ./tools/mooncake_installer.sh /vllm-workspace/
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
# Install Mooncake dependencies
# Install clang-15 (for triton-ascend) and Mooncake
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev libjemalloc2 clang-15 && \
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 && \
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
mv /vllm-workspace/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
@@ -47,17 +43,28 @@ RUN apt-get update -y && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install modelscope (for fast download) and ray (for multinode)
RUN pip config set global.index-url ${PIP_INDEX_URL} && \
python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm && \
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
ARG SOC_VERSION="ascend910_9391"
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
COPY . /vllm-workspace/vllm-ascend/
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
@@ -65,18 +72,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install clang-15 (for triton-ascend)
RUN apt-get update -y && \
apt-get -y install clang-15 && \
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 && \
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -19,24 +19,18 @@ FROM quay.io/ascend/cann:8.5.0-a3-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG MOONCAKE_TAG="v0.3.8.post1"
ARG SOC_VERSION="ascend910_9391"
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
COPY ./tools/mooncake_installer.sh /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
# Install clang (for triton-ascend) and Mooncake
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc clang && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cp /vllm-workspace/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
@@ -48,16 +42,27 @@ RUN yum update -y && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/yum/*
# Install modelscope (for fast download) and ray (for multinode)
RUN pip config set global.index-url ${PIP_INDEX_URL} && \
python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm && \
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
ARG SOC_VERSION="ascend910_9391"
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
COPY . /vllm-workspace/vllm-ascend/
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
@@ -66,15 +71,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install clang (for triton-ascend)
RUN yum update -y && \
yum install -y clang && \
rm -rf /var/cache/yum/*
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
CMD ["/bin/bash"]

View File

@@ -18,25 +18,18 @@
FROM quay.io/ascend/cann:8.5.0-910b-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG MOONCAKE_TAG="v0.3.8.post1"
ARG SOC_VERSION="ascend910b1"
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
COPY ./tools/mooncake_installer.sh /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
# Install clang (for triton-ascend) and Mooncake
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel jemalloc clang && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
mv /vllm-workspace/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
@@ -48,16 +41,28 @@ RUN yum update -y && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/yum/*
# Install modelscope (for fast download) and ray (for multinode)
RUN pip config set global.index-url ${PIP_INDEX_URL} && \
python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.16.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm && \
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
ARG MOONCAKE_TAG="v0.3.8.post1"
ARG SOC_VERSION="ascend910b1"
ENV SOC_VERSION=$SOC_VERSION \
TASK_QUEUE_ENABLE=1 \
OMP_NUM_THREADS=1
COPY . /vllm-workspace/vllm-ascend/
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
@@ -66,15 +71,6 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install clang (for triton-ascend)
RUN yum update -y && \
yum install -y clang && \
rm -rf /var/cache/yum/*
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
RUN echo "export LD_PRELOAD=/usr/lib64/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
CMD ["/bin/bash"]