chore(gb200): update to CUDA 12.9 and improve build process (#8772)

2025-08-08 13:42:47 -07:00
parent 36bfddecb9
commit 4e7f025219
7 changed files with 86 additions and 50 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -79,14 +79,17 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
    fi

-# Build and install NVSHMEM + DeepEP
-RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
- && git clone https://github.com/deepseek-ai/DeepEP.git \
- && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \
- && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
- && cd nvshmem \
- && rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
- && NVSHMEM_SHMEM_SUPPORT=0 \
+# Download source files
+RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    git clone https://github.com/deepseek-ai/DeepEP.git && \
+    cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \
+    tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    mv nvshmem_src nvshmem && \
+    rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+
+# Build and install NVSHMEM
+RUN cd /sgl-workspace/nvshmem && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
    NVSHMEM_UCX_SUPPORT=0 \
    NVSHMEM_USE_NCCL=0 \
    NVSHMEM_MPI_SUPPORT=0 \
@@ -94,10 +97,12 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour
    NVSHMEM_PMIX_SUPPORT=0 \
    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
    NVSHMEM_USE_GDRCOPY=1 \
-    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=90 \
- && cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \
- && cd /sgl-workspace/DeepEP \
- && NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" && \
+    cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
+
+# Install DeepEP
+RUN cd /sgl-workspace/DeepEP && \
+    NVSHMEM_DIR=${NVSHMEM_DIR} pip install .

 # Python tools
 RUN python3 -m pip install --no-cache-dir \
@@ -110,7 +115,8 @@ RUN python3 -m pip install --no-cache-dir \
    icdiff \
    uv \
    wheel \
-    scikit-build-core
+    scikit-build-core \
+    nixl

 # Install development tools and utilities
 RUN apt-get update && apt-get install -y \
--- a/docker/Dockerfile.gb200
+++ b/docker/Dockerfile.gb200
@@ -1,4 +1,4 @@
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.9.1
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04

 ARG BUILD_TYPE=blackwell
@@ -38,7 +38,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

-# --- Install SGLang missing package for blackwell build type
+# Install packages missing from the SGLang blackwell build type
 RUN python3 -m pip install openai httpx

 # GDRCopy installation
@@ -60,33 +60,39 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
 && case "$CUDA_VERSION" in \
      12.6.1) CUINDEX=126 ;; \
      12.8.1) CUINDEX=128 ;; \
+      12.9.1) CUINDEX=129 ;; \
      *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac \
 && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
- && if [ "$CUDA_VERSION" = "12.8.1" ]; then \
+ && if [ "$CUDA_VERSION" = "12.9.1" ]; then \
      python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps ; \
-      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.7/sgl_kernel-0.2.7+cu128-cp39-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
+      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3+cu129-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
    fi

-    # Build and install NVSHMEM + DeepEP
-RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
-&& git clone https://github.com/fzyzcjy/DeepEP.git \
-&& cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \
-&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
-&& cd nvshmem \
-&& rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
-&& NVSHMEM_SHMEM_SUPPORT=0 \
-   NVSHMEM_UCX_SUPPORT=0 \
-   NVSHMEM_USE_NCCL=0 \
-   NVSHMEM_MPI_SUPPORT=0 \
-   NVSHMEM_IBGDA_SUPPORT=1 \
-   NVSHMEM_PMIX_SUPPORT=0 \
-   NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
-   NVSHMEM_USE_GDRCOPY=1 \
-   cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" \
-&& cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \
-&& cd /sgl-workspace/DeepEP \
-&& NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
+# Download source files
+RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    git clone https://github.com/fzyzcjy/DeepEP.git && \
+    cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \
+    tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    mv nvshmem_src nvshmem && \
+    rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+
+# Build and install NVSHMEM
+RUN cd /sgl-workspace/nvshmem && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
+    NVSHMEM_UCX_SUPPORT=0 \
+    NVSHMEM_USE_NCCL=0 \
+    NVSHMEM_MPI_SUPPORT=0 \
+    NVSHMEM_IBGDA_SUPPORT=1 \
+    NVSHMEM_PMIX_SUPPORT=0 \
+    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+    NVSHMEM_USE_GDRCOPY=1 \
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" && \
+    cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
+
+# Install DeepEP
+RUN cd /sgl-workspace/DeepEP && \
+    NVSHMEM_DIR=${NVSHMEM_DIR} pip install .

 # Python tools
 RUN python3 -m pip install --no-cache-dir \
@@ -106,7 +112,7 @@ RUN python3 -m pip install --no-cache-dir \
    nvidia-cudnn-cu12 \
    nvidia-cudnn-frontend

-# Allows for FP4 disaggregation
+# Install nixl kv transfer backend
 RUN python3 -m pip install --no-cache-dir \
    nixl

@@ -163,6 +169,12 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \
    matplotlib \
    tabulate

+# Install flashinfer from source to pick up the bug fix from this PR:
+# https://github.com/flashinfer-ai/flashinfer/pull/1413
+# FIXME: remove this once a flashinfer release newer than 0.2.10 is available
+WORKDIR /sgl-workspace
+RUN git clone https://github.com/flashinfer-ai/flashinfer.git --recursive && cd flashinfer && python3 -m pip install -v .
+
 # Install diff-so-fancy
 RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
    && chmod +x /usr/local/bin/diff-so-fancy