# Dockerfile (listed as 115 lines, 4.2 KiB in the file viewer this was copied from)
# Base image is chosen by the caller at build time, for example:
#   docker build --build-arg BASE_IMAGE=<image:tag> .
# No default is declared, so the build argument has to be supplied.
# NOTE(review): later steps reference /usr/local/cuda, so the base is presumably
# a CUDA development image -- confirm against the build scripts.
ARG BASE_IMAGE
FROM $BASE_IMAGE

# Deps
# One apt layer with every system package the later build steps rely on.
# - The list is de-duplicated and sorted so additions show up cleanly in diffs.
# - git and wget are listed explicitly because later RUN steps in this file
#   invoke them (CMake/NVSHMEM downloads, GDRCopy/DeepEP clones).
# - DEBIAN_FRONTEND is set inline so package configuration cannot block the
#   build on an interactive prompt, without leaking into the runtime env.
# - The apt lists are removed in the same layer that created them.
# - `ln -sf` keeps the step from failing when the base image already ships
#   /usr/bin/python.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        ccache \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        dkms \
        fakeroot \
        git \
        ibverbs-providers \
        infiniband-diags \
        kmod \
        libboost-all-dev \
        libcurl4-openssl-dev \
        libczmq-dev \
        libczmq4 \
        libfabric-dev \
        libgoogle-glog-dev \
        libgrpc++-dev \
        libgrpc-dev \
        libgtest-dev \
        libhiredis-dev \
        libibumad3 \
        libibverbs-dev \
        libibverbs1 \
        libjsoncpp-dev \
        libnl-3-200 \
        libnl-3-dev \
        libnl-route-3-200 \
        libnl-route-3-dev \
        libnuma-dev \
        libopenmpi-dev \
        libprotobuf-dev \
        libpython3-dev \
        librdmacm1 \
        libssl-dev \
        libsubunit-dev \
        libsubunit0 \
        libunwind-dev \
        netcat-openbsd \
        nvidia-dkms-535 \
        openssh-server \
        patchelf \
        perftest \
        pkg-config \
        protobuf-compiler-grpc \
        pybind11-dev \
        python3 \
        python3-pip \
        rdma-core \
        wget \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3 /usr/bin/python

# CMake
# Fetch Kitware's self-extracting CMake 3.27.4 installer, unpack it under
# /usr/local, and delete the installer within the same layer.
# NOTE(review): the installer is the x86_64 build and is not checksum-verified.
RUN installer=cmake-3.27.4-linux-x86_64.sh \
    && wget "https://github.com/Kitware/CMake/releases/download/v3.27.4/${installer}" \
    && chmod +x "${installer}" \
    && "./${installer}" --skip-license --prefix=/usr/local \
    && rm "${installer}"

# GDRCopy
# GDRCOPY_HOME is exported for later build steps.
# NOTE(review): the path is presumably where the gdrdrv-dkms package installed
# below places the v2.4.4 driver sources -- confirm after a build.
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

# Build the GDRCopy v2.4.4 Debian packages from source and install them in the
# same order as before (driver, library, tests, meta package). The checkout and
# the generated .deb files are deleted in this same layer so they do not stay
# in the image.
RUN git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 /tmp/gdrcopy \
    && cd /tmp/gdrcopy/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb \
    && dpkg -i libgdrapi_*.deb \
    && dpkg -i gdrcopy-tests_*.deb \
    && dpkg -i gdrcopy_*.deb \
    && cd / \
    && rm -rf /tmp/gdrcopy

# IBGDA dependency
# Expose the versioned libmlx5 shared object under its unversioned name.
# NOTE(review): presumably needed so that linking against -lmlx5 resolves when
# NVSHMEM is built with IBGDA support -- confirm.
RUN libdir=/usr/lib/x86_64-linux-gnu \
    && ln -sf "${libdir}/libmlx5.so.1" "${libdir}/libmlx5.so"

# DeepEP
# All remaining sources are checked out and built under /sgl-workspace.
# NOTE(review): the clone takes whatever the default branch points at when the
# image is built; consider pinning a tag or commit for reproducible builds.
WORKDIR /sgl-workspace
RUN git clone https://github.com/deepseek-ai/DeepEP.git /sgl-workspace/DeepEP

# NVSHMEM
# Download the NVSHMEM 3.2.5 source archive into the current WORKDIR, unpack it
# as ./nvshmem, apply DeepEP's NVSHMEM patch, and prepend the <unistd.h> include
# to examples/moe_shuffle.cu.
# Download, extraction, and removal of the archive happen in one layer so the
# tarball is not kept in the image. The former debug dump (`cat`) of the
# patched example file has been dropped.
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz \
    && tar -xf nvshmem_src_3.2.5-1.txz \
    && rm nvshmem_src_3.2.5-1.txz \
    && mv nvshmem_src nvshmem \
    && cd /sgl-workspace/nvshmem \
    && git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
    && sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu

# Compile NVSHMEM
ENV CUDA_HOME=/usr/local/cuda

# Configure NVSHMEM with IBGDA and GDRCopy enabled and the SHMEM, UCX, NCCL,
# MPI and PMIx integrations disabled, then build and install it into
# /sgl-workspace/nvshmem/install for CUDA architecture 90.
# The NVSHMEM_* switches are passed as environment variables to the cmake
# configure step only, as before.
# The build is capped at one job per CPU ("$(nproc)"): a bare `-j` lets make
# start an unlimited number of jobs, which can exhaust memory on the builder.
RUN cd /sgl-workspace/nvshmem \
    && NVSHMEM_SHMEM_SUPPORT=0 \
       NVSHMEM_UCX_SUPPORT=0 \
       NVSHMEM_USE_NCCL=0 \
       NVSHMEM_MPI_SUPPORT=0 \
       NVSHMEM_IBGDA_SUPPORT=1 \
       NVSHMEM_PMIX_SUPPORT=0 \
       NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
       NVSHMEM_USE_GDRCOPY=1 \
       cmake -S . -B build/ \
           -DCMAKE_INSTALL_PREFIX=/sgl-workspace/nvshmem/install \
           -DCMAKE_CUDA_ARCHITECTURES=90 \
    && make -C build install -j"$(nproc)"

# Build and install DeepEP from its checkout.
# NVSHMEM_DIR is exported via ENV, so the pip build below sees it without an
# inline assignment, and it also remains set in the final image.
# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
WORKDIR /sgl-workspace/DeepEP
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
RUN pip install --no-cache-dir --break-system-packages .

# Install mooncake transfer engine
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): the package version is not pinned, so each build installs the
# newest release available at build time.
RUN pip install --no-cache-dir --upgrade --break-system-packages mooncake_transfer_engine