From fda0cb2a304392d88999cfdfad3b1df38407fc47 Mon Sep 17 00:00:00 2001 From: kyleliang-nv Date: Sat, 18 Oct 2025 15:06:05 -0700 Subject: [PATCH] Fix Dockerfile not installing correct version of DeepEP for arm build (#11773) --- .github/workflows/release-docker-dev.yml | 4 +++- .github/workflows/release-docker.yml | 4 ++++ docker/Dockerfile | 5 +++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-docker-dev.yml b/.github/workflows/release-docker-dev.yml index 3e1a6090d..13542af63 100644 --- a/.github/workflows/release-docker-dev.yml +++ b/.github/workflows/release-docker-dev.yml @@ -15,11 +15,13 @@ jobs: - runner: x64-docker-build-node platform: linux/amd64 build_type: all + grace_blackwell: 0 tag: dev-x86 version: 12.9.1 - runner: arm-docker-build-node platform: linux/arm64 build_type: all + grace_blackwell: 1 tag: dev-arm64 version: 12.9.1 steps: @@ -51,7 +53,7 @@ jobs: - name: Build and Push Dev Image run: | - docker buildx build --platform ${{ matrix.platform }} --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.tag }} --no-cache . + docker buildx build --platform ${{ matrix.platform }} --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GRACE_BLACKWELL=${{ matrix.grace_blackwell }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.tag }} --no-cache . create-manifests: runs-on: ubuntu-22.04 diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 30e64b014..596033854 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -16,6 +16,7 @@ jobs: variant: - cuda_version: "12.9.1" build_type: "all" + grace_blackwell: 0 runs-on: x64-docker-build-node steps: - name: Delete huge unnecessary tools folder @@ -55,6 +56,7 @@ jobs: -f docker/Dockerfile \ --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} \ --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \ + --build-arg GRACE_BLACKWELL=${{ matrix.variant.grace_blackwell }} \ -t lmsysorg/sglang:${tag} \ --no-cache \ . @@ -67,6 +69,7 @@ jobs: variant: - cuda_version: "12.9.1" build_type: "all" + grace_blackwell: 1 runs-on: arm-docker-build-node steps: - name: Delete huge unnecessary tools folder @@ -95,6 +98,7 @@ jobs: -f docker/Dockerfile \ --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} \ --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \ + --build-arg GRACE_BLACKWELL=${{ matrix.variant.grace_blackwell }} \ -t lmsysorg/sglang:${tag} \ --no-cache \ . diff --git a/docker/Dockerfile b/docker/Dockerfile index dfd042929..3368919b0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,6 +2,7 @@ ARG CUDA_VERSION=12.9.1 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base ARG TARGETARCH +ARG GRACE_BLACKWELL=0 ARG BUILD_TYPE=all ARG BRANCH_TYPE=remote ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee @@ -99,7 +100,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li # Download NVSHMEM source files # We use Tom's DeepEP fork for GB200 for now RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \ - if [ "$BUILD_TYPE" = "blackwell_aarch64" ]; then \ + if [ "$GRACE_BLACKWELL" = "1" ]; then \ git clone https://github.com/fzyzcjy/DeepEP.git \ && cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \ else \ @@ -112,7 +113,7 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour # Build and install NVSHMEM RUN cd /sgl-workspace/nvshmem && \ - if [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \ + if [ "$GRACE_BLACKWELL" = "1" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \ NVSHMEM_SHMEM_SUPPORT=0 \ NVSHMEM_UCX_SUPPORT=0 \ NVSHMEM_USE_NCCL=0 \