diff --git a/.github/workflows/release-docker-gb200.yml b/.github/workflows/release-docker-gb200.yml
index 9f7c6cf73..fbcacb330 100644
--- a/.github/workflows/release-docker-gb200.yml
+++ b/.github/workflows/release-docker-gb200.yml
@@ -11,7 +11,7 @@ jobs:
   publish:
     if: github.repository == 'sgl-project/sglang'
     runs-on: ubuntu-22.04-arm
-    environment: 'prod'
+    environment: "prod"
     steps:
       - name: Delete huge unnecessary tools folder
         run: rm -rf /opt/hostedtoolcache
@@ -31,6 +31,6 @@ jobs:
       - name: Build and Push
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)
-          tag=v${version}-cu128-gb200
+          tag=v${version}-cu129-gb200
 
-          docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.8.1 --build-arg BUILD_TYPE=blackwell --no-cache .
+          docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.9.1 --build-arg BUILD_TYPE=blackwell --no-cache .
diff --git a/.github/workflows/release-whl-kernel-aarch64.yml b/.github/workflows/release-whl-kernel-aarch64.yml
index 8e94fe8fb..5c0c09794 100644
--- a/.github/workflows/release-whl-kernel-aarch64.yml
+++ b/.github/workflows/release-whl-kernel-aarch64.yml
@@ -17,17 +17,17 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build-cu128-aarch64:
+  build-cu129-aarch64:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: sgl-kernel-release-node
+    runs-on: sgl-kernel-release-node-arm
     strategy:
       matrix:
-        python-version: ['3.9']
-        cuda-version: ['12.8']
+        python-version: ["3.10"]
+        cuda-version: ["12.9"]
     steps:
       - uses: actions/checkout@v4
         with:
-          submodules: 'recursive'
+          submodules: "recursive"
 
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
@@ -47,7 +47,7 @@ jobs:
           path: sgl-kernel/dist/*
 
   release:
-    needs: build-cu128-aarch64
+    needs: build-cu129-aarch64
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -84,7 +84,7 @@ jobs:
           WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
 
       - name: Update wheel index
-        run: python3 scripts/update_kernel_whl_index.py --cuda 128
+        run: python3 scripts/update_kernel_whl_index.py --cuda 129
 
       - name: Push wheel index
         run: |
diff --git a/docker/Dockerfile b/docker/Dockerfile
index fbfb7f5c1..67c5e4007 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -79,14 +79,17 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
       python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
     fi
 
-# Build and install NVSHMEM + DeepEP
-RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
- && git clone https://github.com/deepseek-ai/DeepEP.git \
- && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \
- && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
- && cd nvshmem \
- && rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
- && NVSHMEM_SHMEM_SUPPORT=0 \
+# Download source files
+RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    git clone https://github.com/deepseek-ai/DeepEP.git && \
+    cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \
+    tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    mv nvshmem_src nvshmem && \
+    rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+
+# Build and install NVSHMEM
+RUN cd /sgl-workspace/nvshmem && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
     NVSHMEM_UCX_SUPPORT=0 \
     NVSHMEM_USE_NCCL=0 \
     NVSHMEM_MPI_SUPPORT=0 \
@@ -94,10 +97,12 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour
     NVSHMEM_PMIX_SUPPORT=0 \
     NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
     NVSHMEM_USE_GDRCOPY=1 \
-    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=90 \
- && cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \
- && cd /sgl-workspace/DeepEP \
- && NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=90 && \
+    cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
+
+# Install DeepEP
+RUN cd /sgl-workspace/DeepEP && \
+    NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
 
 # Python tools
 RUN python3 -m pip install --no-cache-dir \
@@ -110,7 +115,8 @@ RUN python3 -m pip install --no-cache-dir \
     icdiff \
     uv \
     wheel \
-    scikit-build-core
+    scikit-build-core \
+    nixl
 
 # Install development tools and utilities
 RUN apt-get update && apt-get install -y \
diff --git a/docker/Dockerfile.gb200 b/docker/Dockerfile.gb200
index 0d818692d..ba56b56d5 100644
--- a/docker/Dockerfile.gb200
+++ b/docker/Dockerfile.gb200
@@ -1,4 +1,4 @@
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.9.1
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
 
 ARG BUILD_TYPE=blackwell
@@ -38,7 +38,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
  && rm -rf /var/lib/apt/lists/* \
  && apt-get clean
 
-# --- Install SGLang missing package for blackwell build type
+# Install SGLang missing package for blackwell build type
 RUN python3 -m pip install openai httpx
 
 # GDRCopy installation
@@ -60,33 +60,39 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
  && case "$CUDA_VERSION" in \
       12.6.1) CUINDEX=126 ;; \
       12.8.1) CUINDEX=128 ;; \
+      12.9.1) CUINDEX=129 ;; \
       *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
     esac \
  && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
- && if [ "$CUDA_VERSION" = "12.8.1" ]; then \
+ && if [ "$CUDA_VERSION" = "12.9.1" ]; then \
       python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps ; \
-      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.7/sgl_kernel-0.2.7+cu128-cp39-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
+      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3+cu129-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
     fi
 
-    # Build and install NVSHMEM + DeepEP
-RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
-&& git clone https://github.com/fzyzcjy/DeepEP.git \
-&& cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \
-&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
-&& cd nvshmem \
-&& rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
-&& NVSHMEM_SHMEM_SUPPORT=0 \
-   NVSHMEM_UCX_SUPPORT=0 \
-   NVSHMEM_USE_NCCL=0 \
-   NVSHMEM_MPI_SUPPORT=0 \
-   NVSHMEM_IBGDA_SUPPORT=1 \
-   NVSHMEM_PMIX_SUPPORT=0 \
-   NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
-   NVSHMEM_USE_GDRCOPY=1 \
-   cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" \
-&& cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \
-&& cd /sgl-workspace/DeepEP \
-&& NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
+# Download source files
+RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    git clone https://github.com/fzyzcjy/DeepEP.git && \
+    cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \
+    tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    mv nvshmem_src nvshmem && \
+    rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+
+# Build and install NVSHMEM
+RUN cd /sgl-workspace/nvshmem && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
+    NVSHMEM_UCX_SUPPORT=0 \
+    NVSHMEM_USE_NCCL=0 \
+    NVSHMEM_MPI_SUPPORT=0 \
+    NVSHMEM_IBGDA_SUPPORT=1 \
+    NVSHMEM_PMIX_SUPPORT=0 \
+    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+    NVSHMEM_USE_GDRCOPY=1 \
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" && \
+    cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
+
+# Install DeepEP
+RUN cd /sgl-workspace/DeepEP && \
+    NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
 
 # Python tools
 RUN python3 -m pip install --no-cache-dir \
@@ -106,7 +112,7 @@ RUN python3 -m pip install --no-cache-dir \
     nvidia-cudnn-cu12 \
     nvidia-cudnn-frontend
 
-# Allows for FP4 disaggregation
+# Install nixl kv transfer backend
 RUN python3 -m pip install --no-cache-dir \
     nixl
 
@@ -163,6 +169,12 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \
     matplotlib \
     tabulate
 
+# Install flashinfer from source to pick up the bug fix from this upstream PR:
+# https://github.com/flashinfer-ai/flashinfer/pull/1413
+# FIXME: remove this step once a flashinfer release newer than 0.2.10 is available
+WORKDIR /sgl-workspace
+RUN git clone https://github.com/flashinfer-ai/flashinfer.git --recursive && cd flashinfer && python3 -m pip install -v .
+
 # Install diff-so-fancy
 RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
     && chmod +x /usr/local/bin/diff-so-fancy
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index c09a128b5..c9d83303c 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -259,7 +259,7 @@ class Engine(EngineBase):
                     f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]"
                 )
 
-        logger.info(f"data_parallel_rank: {data_parallel_rank}")
+        logger.debug(f"data_parallel_rank: {data_parallel_rank}")
         obj = GenerateReqInput(
             text=prompt,
             input_ids=input_ids,
diff --git a/sgl-kernel/build.sh b/sgl-kernel/build.sh
index 8ae22294a..e812c0c7b 100755
--- a/sgl-kernel/build.sh
+++ b/sgl-kernel/build.sh
@@ -39,6 +39,13 @@ docker run --rm \
    # Install CMake (version >= 3.26) - Robust Installation
    export CMAKE_VERSION_MAJOR=3.31
    export CMAKE_VERSION_MINOR=1
+   # Limit build parallelism on ARM only, to reduce the chance of OOM
+   if [ \"${ARCH}\" = \"aarch64\" ]; then
+      export CUDA_NVCC_FLAGS=\"-Xcudafe --threads=2\"
+      export MAKEFLAGS='-j2'
+      export CMAKE_BUILD_PARALLEL_LEVEL=2
+      export NINJAFLAGS='-j2'
+   fi
    echo \"Downloading CMake from: https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz\"
    wget https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz
    tar -xzf cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz
diff --git a/sgl-kernel/rename_wheels.sh b/sgl-kernel/rename_wheels.sh
index b52a94a92..cab79e44e 100755
--- a/sgl-kernel/rename_wheels.sh
+++ b/sgl-kernel/rename_wheels.sh
@@ -7,8 +7,19 @@ wheel_files=($WHEEL_DIR/*.whl)
 for wheel in "${wheel_files[@]}"; do
     intermediate_wheel="${wheel/linux/manylinux2014}"
 
-    if ls /usr/local/ | grep -q "12.8"; then
-        new_wheel="${intermediate_wheel/-cp39/+cu128-cp39}"
+    # Extract the current python version from the wheel name
+    if [[ $intermediate_wheel =~ -cp([0-9]+)- ]]; then
+        cp_version="${BASH_REMATCH[1]}"
+    else
+        echo "Could not extract Python version from wheel name: $intermediate_wheel"
+        continue
+    fi
+
+    # Detect CUDA version and add appropriate suffix
+    if ls /usr/local/ | grep -q "12.9"; then
+        new_wheel="${intermediate_wheel/-cp${cp_version}/+cu129-cp${cp_version}}"
+    elif ls /usr/local/ | grep -q "12.8"; then
+        new_wheel="${intermediate_wheel/-cp${cp_version}/+cu128-cp${cp_version}}"
     else
         new_wheel="$intermediate_wheel"
     fi