diff --git a/.github/workflows/release-docker-dev.yml b/.github/workflows/release-docker-dev.yml
index b47f998c2..4b4701099 100644
--- a/.github/workflows/release-docker-dev.yml
+++ b/.github/workflows/release-docker-dev.yml
@@ -3,7 +3,7 @@ name: Build and Push Development Docker Images
 on:
   workflow_dispatch:
   schedule:
-    - cron: '0 0 * * *'
+    - cron: "0 0 * * *"
 
 jobs:
   build-dev-x86:
@@ -14,7 +14,7 @@ jobs:
         variant:
           - version: 12.9.1
             type: all
-            tag: dev
+            tag: dev-x86
     steps:
       - name: Delete huge unnecessary tools folder
         run: rm -rf /opt/hostedtoolcache
@@ -46,15 +46,15 @@ jobs:
         run: |
           docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
 
-  build-blackwell-arm:
+  build-dev-arm:
     if: ${{ github.repository == 'sgl-project/sglang' }}
-    runs-on: labubu
+    runs-on: sgl-kernel-release-node-arm
     strategy:
       matrix:
         variant:
           - version: 12.9.1
-            type: blackwell_aarch
-            tag: blackwell-cu129
+            type: blackwell_aarch64
+            tag: dev-arm64
     steps:
       - name: Delete huge unnecessary tools folder
         run: rm -rf /opt/hostedtoolcache
@@ -84,19 +84,18 @@ jobs:
 
       - name: Build and Push Blackwell Image (ARM)
         run: |
-          docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }}-arm64 --no-cache .
-
+          docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
 
   create-manifests:
     runs-on: ubuntu-22.04
-    needs: [build-dev-x86, build-blackwell-arm]
+    needs: [build-dev-x86, build-dev-arm]
     if: ${{ github.repository == 'sgl-project/sglang' }}
     strategy:
       matrix:
         variant:
-          - tag: dev-manifest
-            x86_tag: dev
-            arm64_tag: blackwell-cu129-arm64
+          - tag: dev
+            x86_tag: dev-x86
+            arm64_tag: dev-arm64
     steps:
       - uses: docker/setup-buildx-action@v3
       - uses: docker/login-action@v2
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index 811fedb26..7b5a6dda7 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -10,16 +10,16 @@ on:
 jobs:
   publish-x86:
     if: github.repository == 'sgl-project/sglang'
-    environment: 'prod'
+    environment: "prod"
     strategy:
       matrix:
         variant:
-          - cuda_version: '12.6.1'
-            build_type: 'all'
-          - cuda_version: '12.8.1'
-            build_type: 'blackwell'
-          - cuda_version: '12.9.1'
-            build_type: 'blackwell'
+          - cuda_version: "12.6.1"
+            build_type: "all"
+          - cuda_version: "12.8.1"
+            build_type: "blackwell"
+          - cuda_version: "12.9.1"
+            build_type: "blackwell"
     runs-on: nvidia
     steps:
       - name: Delete huge unnecessary tools folder
@@ -82,13 +82,13 @@ jobs:
 
   publish-arm64:
     if: github.repository == 'sgl-project/sglang'
-    environment: 'prod'
+    environment: "prod"
     strategy:
       matrix:
         variant:
-          - cuda_version: '12.9.1'
-            build_type: 'blackwell_aarch'
-    runs-on: labubu
+          - cuda_version: "12.9.1"
+            build_type: "blackwell_aarch64"
+    runs-on: sgl-kernel-release-node-arm
     steps:
       - name: Delete huge unnecessary tools folder
         run: rm -rf /opt/hostedtoolcache
diff --git a/.github/workflows/release-whl-kernel.yml b/.github/workflows/release-whl-kernel.yml
index 75f94996f..c80fd1fd1 100644
--- a/.github/workflows/release-whl-kernel.yml
+++ b/.github/workflows/release-whl-kernel.yml
@@ -206,7 +206,7 @@ jobs:
 
   build-cu129-aarch64:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: labubu
+    runs-on: sgl-kernel-release-node-arm
     strategy:
       matrix:
         python-version: ["3.10"]
diff --git a/docker/Dockerfile b/docker/Dockerfile
index e774b1f57..a6229288d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -93,9 +93,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
  && FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin
 
 
-# Download source files
+# Download NVSHMEM source files
+# blackwell_aarch64 (GB200) builds temporarily use the fzyzcjy/DeepEP fork, pinned to a fixed commit
 RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
-    if [ "$BUILD_TYPE" = "blackwell_aarch" ] && [ "$(uname -m)" = "aarch64" ]; then \
+    if [ "$BUILD_TYPE" = "blackwell_aarch64" ]; then \
       git clone https://github.com/fzyzcjy/DeepEP.git \
       && cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
     else \
@@ -108,7 +109,7 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour
 
 # Build and install NVSHMEM
 RUN cd /sgl-workspace/nvshmem && \
-    if [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
+    export CUDA_ARCH="90;100;120" && \
     NVSHMEM_SHMEM_SUPPORT=0 \
     NVSHMEM_UCX_SUPPORT=0 \
     NVSHMEM_USE_NCCL=0 \