diff --git a/.github/workflows/release-docker-dev.yml b/.github/workflows/release-docker-dev.yml
index 561309ff3..746f641ae 100644
--- a/.github/workflows/release-docker-dev.yml
+++ b/.github/workflows/release-docker-dev.yml
@@ -6,7 +6,7 @@ on:
     - cron: '0 0 * * *'
 
 jobs:
-  build-dev:
+  build-dev-x86:
     if: ${{ github.repository == 'sgl-project/sglang' }}
     runs-on: ubuntu-22.04
     strategy:
@@ -15,6 +15,46 @@ jobs:
           - version: 12.9.1
             type: all
             tag: dev
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: false
+          docker-images: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: false
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push Dev Image (x86)
+        run: |
+          docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
+
+  build-blackwell-x86:
+    if: ${{ github.repository == 'sgl-project/sglang' }}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        variant:
+          - version: 12.8.1
+            type: blackwell
+            tag: blackwell
+          - version: 12.9.1
+            type: blackwell
+            tag: blackwell-cu129
 
     steps:
       - name: Checkout repository
@@ -31,13 +71,79 @@ jobs:
           large-packages: true
           swap-storage: false
 
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
       - name: Login to Docker Hub
         uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
-      - name: Build and Push Dev Image
+      - name: Build and Push Blackwell Image (x86)
         run: |
-          docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache
-          docker push lmsysorg/sglang:${{ matrix.variant.tag }}
+          if [ "${{ matrix.variant.version }}" = "12.9.1" ]; then
+            docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }}-x86 --no-cache .
+          else
+            docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
+          fi
+
+  # arm64 half of the blackwell-cu129 image: built on ubuntu-22.04-arm and pushed as <tag>-arm64 for create-manifests to merge.
+  build-blackwell-arm:
+    runs-on: ubuntu-22.04-arm
+    if: ${{ github.repository == 'sgl-project/sglang' }}
+    strategy:
+      matrix:
+        variant:
+          - tag: blackwell-cu129
+            type: blackwell_aarch
+            version: 12.9.1
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          tool-cache: false
+          docker-images: false
+          swap-storage: false
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+
+      - name: Build and Push Blackwell Image (ARM)
+        run: |
+          docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }}-arm64 --no-cache .
+
+  create-manifests:
+    needs: [build-blackwell-x86, build-blackwell-arm]
+    if: ${{ github.repository == 'sgl-project/sglang' }}
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        variant:
+          - tag: blackwell-cu129
+            x86_tag: blackwell-cu129-x86
+            arm64_tag: blackwell-cu129-arm64
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Create multi-arch manifest
+        run: |
+          docker buildx imagetools create -t lmsysorg/sglang:${{ matrix.variant.tag }} lmsysorg/sglang:${{ matrix.variant.x86_tag }} lmsysorg/sglang:${{ matrix.variant.arm64_tag }}
diff --git a/.github/workflows/release-docker-gb200.yml b/.github/workflows/release-docker-gb200.yml
index 87de03b85..c740d38a4 100644
--- a/.github/workflows/release-docker-gb200.yml
+++ b/.github/workflows/release-docker-gb200.yml
@@ -1,10 +1,5 @@
 name: Release Docker Images (GB200)
 on:
-  push:
-    branches:
-      - main
-    paths:
-      - "python/sglang/version.py"
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index 711cad46b..3d3c7e5e7 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -8,14 +8,19 @@ on:
   workflow_dispatch:
 
 jobs:
-  publish:
+  publish-x86:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: ubuntu-latest
     environment: 'prod'
     strategy:
       matrix:
-        cuda_version: ['12.6.1', '12.9.1']
-        build_type: ['all']
+        variant:
+          - cuda_version: '12.6.1'
+            build_type: 'all'
+          - cuda_version: '12.8.1'
+            build_type: 'blackwell'
+          - cuda_version: '12.9.1'
+            build_type: 'blackwell'
+    runs-on: ubuntu-latest
     steps:
       - name: Delete huge unnecessary tools folder
         run: rm -rf /opt/hostedtoolcache
@@ -34,29 +39,24 @@ jobs:
           large-packages: true
           swap-storage: false
 
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
       - name: Login to Docker Hub
         uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
-      - name: Build and Push
+      - name: Build and Push AMD64
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)
 
-          if [ "${{ matrix.cuda_version }}" = "11.8.0" ]; then
-            cuda_tag="cu118"
-          elif [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
-            cuda_tag="cu121"
-          elif [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
-            cuda_tag="cu124"
-          elif [ "${{ matrix.cuda_version }}" = "12.5.1" ]; then
-            cuda_tag="cu125"
-          elif [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
+          if [ "${{ matrix.variant.cuda_version }}" = "12.6.1" ]; then
             cuda_tag="cu126"
-          elif [ "${{ matrix.cuda_version }}" = "12.8.1" ]; then
+          elif [ "${{ matrix.variant.cuda_version }}" = "12.8.1" ]; then
             cuda_tag="cu128"
-          elif [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
+          elif [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
             cuda_tag="cu129"
           else
             echo "Unsupported CUDA version"
@@ -65,25 +65,58 @@ jobs:
 
           tag=v${version}-${cuda_tag}
 
-          if [ "${{ matrix.build_type }}" = "all" ]; then
+          if [ "${{ matrix.variant.build_type }}" = "all" ]; then
             tag_suffix=""
-          elif [ "${{ matrix.build_type }}" = "srt" ]; then
-            tag_suffix="-srt"
-          elif [ "${{ matrix.build_type }}" = "blackwell" ]; then
+          elif [ "${{ matrix.variant.build_type }}" = "blackwell" ]; then
             tag_suffix="-b200"
           else
             echo "Unsupported build type"
             exit 1
           fi
 
-          docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
-
-          if [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
-            docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:latest${tag_suffix}
-            docker push lmsysorg/sglang:latest${tag_suffix}
+          if [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
+            docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} -t lmsysorg/sglang:latest --no-cache .
+          else
+            docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache .
           fi
 
-          if [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
-            docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:v${version}
-            docker push lmsysorg/sglang:v${version}
+  publish-arm64:
+    if: github.repository == 'sgl-project/sglang'
+    environment: 'prod'
+    strategy:
+      matrix:
+        variant:
+          - cuda_version: '12.9.1'
+            build_type: 'blackwell_aarch'
+    runs-on: ubuntu-22.04-arm
+    steps:
+      - name: Delete huge unnecessary tools folder
+        run: rm -rf /opt/hostedtoolcache
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push ARM64
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+
+          if [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
+            cuda_tag="cu129"
+          else
+            echo "Unsupported CUDA version"
+            exit 1
           fi
+
+          tag=v${version}-${cuda_tag}
+          tag_suffix="-gb200"
+
+          docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache .
diff --git a/.github/workflows/release-whl-kernel.yml b/.github/workflows/release-whl-kernel.yml
index ffc8eb450..75f94996f 100644
--- a/.github/workflows/release-whl-kernel.yml
+++ b/.github/workflows/release-whl-kernel.yml
@@ -44,7 +44,7 @@ jobs:
         working-directory: sgl-kernel
         run: |
           pip install twine
-          python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+          python3 -m twine upload --skip-existing dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
 
   build-cu124:
     if: github.repository == 'sgl-project/sglang'
@@ -227,6 +227,12 @@ jobs:
           chmod +x ./build.sh
           ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" aarch64
 
+      - name: Upload to PyPI
+        working-directory: sgl-kernel
+        run: |
+          pip install twine
+          python3 -m twine upload --skip-existing dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2f4c22d90..45c6e5dd7 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,10 +1,11 @@
 ARG CUDA_VERSION=12.9.1
-FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 as base
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base
 
 ARG BUILD_TYPE=all
 ARG BRANCH_TYPE=remote
 ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
 ARG CMAKE_BUILD_PARALLEL_LEVEL=2
+ARG SGL_KERNEL_VERSION=0.3.12
 ENV DEBIAN_FRONTEND=noninteractive \
     CUDA_HOME=/usr/local/cuda \
     GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
@@ -57,7 +58,7 @@ RUN mkdir -p /tmp/gdrcopy && cd /tmp \
  && cd / && rm -rf /tmp/gdrcopy
 
 # Fix DeepEP IBGDA symlink
-RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
+RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
 
 FROM scratch AS local_src
 COPY . /src
@@ -81,24 +82,33 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
       12.9.1) CUINDEX=129 ;; \
       *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
     esac \
+ && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
+     python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
+   fi \
+&& if [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
+     python3 -m pip install --no-cache-dir sgl-kernel==${SGL_KERNEL_VERSION} ; \
+   fi \
  && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
  && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
- && python3 -m flashinfer --download-cubin \
- && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
-      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.11/sgl_kernel-0.3.11+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
-    fi
+ && python3 -m flashinfer --download-cubin
+
 
 # Download source files
 RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
-    git clone https://github.com/deepseek-ai/DeepEP.git && \
-    cd DeepEP && git checkout ${DEEPEP_COMMIT} && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
-    cd .. && \
-    tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
-    mv nvshmem_src nvshmem && \
-    rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+    if { [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; } && [ "$(uname -m)" = "aarch64" ]; then \
+      git clone https://github.com/fzyzcjy/DeepEP.git \
+      && cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
+    else \
+      git clone https://github.com/deepseek-ai/DeepEP.git \
+      && cd DeepEP && git checkout ${DEEPEP_COMMIT} && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
+    fi \
+    && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
+    && mv nvshmem_src nvshmem \
+    && rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
 
 # Build and install NVSHMEM
 RUN cd /sgl-workspace/nvshmem && \
+    if [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
     NVSHMEM_SHMEM_SUPPORT=0 \
     NVSHMEM_UCX_SUPPORT=0 \
     NVSHMEM_USE_NCCL=0 \
@@ -107,7 +117,7 @@ RUN cd /sgl-workspace/nvshmem && \
     NVSHMEM_PMIX_SUPPORT=0 \
     NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
     NVSHMEM_USE_GDRCOPY=1 \
-    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="90" && \
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} && \
     cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
 
 # Install DeepEP
@@ -125,6 +135,7 @@ RUN cd /sgl-workspace/DeepEP && \
     esac && \
     NVSHMEM_DIR=${NVSHMEM_DIR} TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" pip install .
 
+
 # Python tools
 RUN python3 -m pip install --no-cache-dir \
     datamodel_code_generator \
@@ -169,16 +180,16 @@ RUN apt-get update && apt-get install -y \
 
 RUN apt update -y \
     && apt install -y --no-install-recommends gnupg \
-    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
-    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub \
+    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
+    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "x86_64"; fi)/7fa2af80.pub \
     && apt update -y \
     && apt install nsight-systems-cli -y
 
 # Set up locale
 RUN locale-gen en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US:en
-ENV LC_ALL en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US:en
+ENV LC_ALL=en_US.UTF-8
 
 # Install minimal Python packages
 RUN python3 -m pip install --no-cache-dir --break-system-packages \
@@ -186,7 +197,7 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \
     black \
     isort \
     icdiff \
-    scikit_build_core \
+    scikit-build-core \
     uv \
     pre-commit \
     pandas \
@@ -209,11 +220,14 @@ RUN curl -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-lin
     && rm -rf clangd_18.1.3 clangd.zip
 
 # Install CMake
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.tar.gz \
-    && tar -xzf cmake-3.31.1-linux-x86_64.tar.gz \
-    && cp -r cmake-3.31.1-linux-x86_64/bin/* /usr/local/bin/ \
-    && cp -r cmake-3.31.1-linux-x86_64/share/* /usr/local/share/ \
-    && rm -rf cmake-3.31.1-linux-x86_64 cmake-3.31.1-linux-x86_64.tar.gz
+RUN CMAKE_VERSION=3.31.1 \
+    && ARCH=$(uname -m) \
+    && CMAKE_INSTALLER="cmake-${CMAKE_VERSION}-linux-${ARCH}" \
+    && wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
+    && tar -xzf "${CMAKE_INSTALLER}.tar.gz" \
+    && cp -r "${CMAKE_INSTALLER}/bin/"* /usr/local/bin/ \
+    && cp -r "${CMAKE_INSTALLER}/share/"* /usr/local/share/ \
+    && rm -rf "${CMAKE_INSTALLER}" "${CMAKE_INSTALLER}.tar.gz"
 
 # Install Rust toolchain for sgl-router
 ENV PATH="/root/.cargo/bin:${PATH}"