Fix Dockerfile not installing correct version of DeepEP for arm build (#11773)

This commit is contained in:
kyleliang-nv
2025-10-18 15:06:05 -07:00
committed by GitHub
parent ebda73dc72
commit fda0cb2a30
3 changed files with 10 additions and 3 deletions

View File

@@ -2,6 +2,7 @@ ARG CUDA_VERSION=12.9.1
# Base stage: NVIDIA CUDA + cuDNN development image on Ubuntu 22.04.
# The CUDA release is chosen by the CUDA_VERSION build argument.
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base
# Target architecture of the image being built; one of the automatic platform
# build arguments, redeclared here so it is visible inside this stage.
ARG TARGETARCH
# Grace Blackwell switch (default 0). RUN steps in this file compare it against
# "1" to pick the DeepEP fork to clone and the CUDA_ARCH list used for NVSHMEM.
ARG GRACE_BLACKWELL=0
# Build flavour selector (default "all").
# NOTE(review): the steps that consume it are not fully visible here - confirm
# the accepted values against the rest of the file.
ARG BUILD_TYPE=all
# NOTE(review): presumably selects between a remote checkout and local sources;
# its usage is not visible in this excerpt - confirm.
ARG BRANCH_TYPE=remote
# Pinned DeepEP commit hash.
# NOTE(review): presumably checked out on the non-Grace-Blackwell path - confirm.
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
@@ -99,7 +100,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
# Download NVSHMEM source files
# For GB200 (Grace Blackwell) builds we use Tom's DeepEP fork (fzyzcjy/DeepEP) for now
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
if [ "$BUILD_TYPE" = "blackwell_aarch64" ]; then \
if [ "$GRACE_BLACKWELL" = "1" ]; then \
git clone https://github.com/fzyzcjy/DeepEP.git \
&& cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
else \
@@ -112,7 +113,7 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour
# Build and install NVSHMEM
RUN cd /sgl-workspace/nvshmem && \
if [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
if [ "$GRACE_BLACKWELL" = "1" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \