[pd] optimize dockerfile for pd disaggregation (#7319)
Co-authored-by: zhyncs <me@zhyncs.com>
This commit is contained in:
@@ -1,67 +1,100 @@
|
||||
# The base image is chosen by the caller at build time (--build-arg BASE_IMAGE=...).
# No default is declared, so the build argument has to be supplied.
ARG BASE_IMAGE
|
||||
FROM ${BASE_IMAGE}
|
||||
# Deps
# System packages for the whole image: RDMA/InfiniBand userspace, the C/C++ build
# toolchain and development headers, Debian packaging tools, the NVIDIA DKMS
# driver package, and Python. Each package is listed once, one per line, in
# alphabetical order so that later diffs stay small.
# The apt lists are removed in the same layer so they do not persist in the image.
RUN apt-get update && apt-get install -y \
    build-essential \
    ccache \
    check \
    cmake \
    curl \
    debhelper \
    devscripts \
    dkms \
    fakeroot \
    ibverbs-providers \
    infiniband-diags \
    kmod \
    libboost-all-dev \
    libcurl4-openssl-dev \
    libczmq-dev \
    libczmq4 \
    libfabric-dev \
    libgoogle-glog-dev \
    libgrpc++-dev \
    libgrpc-dev \
    libgtest-dev \
    libhiredis-dev \
    libibumad3 \
    libibverbs-dev \
    libibverbs1 \
    libjsoncpp-dev \
    libnl-3-200 \
    libnl-3-dev \
    libnl-route-3-200 \
    libnl-route-3-dev \
    libnuma-dev \
    libopenmpi-dev \
    libprotobuf-dev \
    libpython3-dev \
    librdmacm1 \
    libssl-dev \
    libsubunit-dev \
    libsubunit0 \
    libunwind-dev \
    netcat-openbsd \
    nvidia-dkms-535 \
    openssh-server \
    patchelf \
    perftest \
    pkg-config \
    protobuf-compiler-grpc \
    pybind11-dev \
    python3 \
    python3-pip \
    rdma-core \
    && rm -rf /var/lib/apt/lists/* \
    # -f keeps the step from failing when /usr/bin/python is already present.
    && ln -sf /usr/bin/python3 /usr/bin/python
|
||||
|
||||
# CMake
# Install CMake 3.27.4 into /usr/local from the upstream self-extracting installer.
# Everything happens in one RUN: the tools needed for the download are installed,
# the installer is fetched exactly once, executed, and deleted again, and the apt
# lists are dropped so that neither they nor the installer stay in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        libssl-dev \
        wget \
    && rm -rf /var/lib/apt/lists/* \
    && wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \
    && chmod +x cmake-3.27.4-linux-x86_64.sh \
    && ./cmake-3.27.4-linux-x86_64.sh --skip-license --prefix=/usr/local \
    && rm cmake-3.27.4-linux-x86_64.sh
|
||||
|
||||
# Python
# Make sure python3 and pip are installed and that `python` resolves to python3.
# The symlink is created with -f so the step still succeeds when an earlier step
# (or the base image) has already provided /usr/bin/python.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3 /usr/bin/python
|
||||
|
||||
# GDRCopy
# Build the GDRCopy v2.4.4 Debian packages from source and install them.
# The sources are cloned once, directly at the release tag. The packaging
# prerequisites (nvidia-dkms-535, build-essential, devscripts, debhelper, fakeroot,
# pkg-config, dkms, check, libsubunit0, libsubunit-dev) are installed by the Deps
# step at the top of this file, so no extra apt layer is needed here.
WORKDIR /tmp
RUN git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4

# build-deb-packages.sh is run from the packages directory; the .deb files it
# produces are installed from that same directory.
WORKDIR /tmp/gdrcopy/packages
RUN CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb \
    && dpkg -i libgdrapi_*.deb \
    && dpkg -i gdrcopy-tests_*.deb \
    && dpkg -i gdrcopy_*.deb

ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

# IBGDA dependency
# Provide the unversioned libmlx5.so name as a symlink to libmlx5.so.1.
# libfabric-dev is installed by the Deps step, so it is not reinstalled here.
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
|
||||
|
||||
# DeepEP
# Check out the DeepEP sources under the shared workspace directory.
WORKDIR /sgl-workspace
RUN git clone https://github.com/deepseek-ai/DeepEP.git /sgl-workspace/DeepEP
|
||||
|
||||
# NVSHMEM
# Download the NVSHMEM 3.2.5-1 source archive, unpack it to /sgl-workspace/nvshmem,
# and delete the archive in the same layer so it does not persist in the image.
WORKDIR /sgl-workspace
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz \
    && tar -xf nvshmem_src_3.2.5-1.txz \
    && mv nvshmem_src nvshmem \
    && rm nvshmem_src_3.2.5-1.txz

# Apply the patch shipped with DeepEP exactly once (a second `git apply` of the
# same patch would fail), prepend `#include <unistd.h>` to the moe_shuffle
# example, and print the patched file to the build log.
WORKDIR /sgl-workspace/nvshmem
RUN git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
    && sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu \
    && cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu
|
||||
# Compile NVSHMEM
|
||||
# Export the CUDA toolkit location for the following steps.
# NOTE(review): presumably read by the NVSHMEM build below — confirm.
ENV CUDA_HOME=/usr/local/cuda
|
||||
RUN NVSHMEM_SHMEM_SUPPORT=0 \
|
||||
RUN cd /sgl-workspace/nvshmem && NVSHMEM_SHMEM_SUPPORT=0 \
|
||||
NVSHMEM_UCX_SUPPORT=0 \
|
||||
NVSHMEM_USE_NCCL=0 \
|
||||
NVSHMEM_MPI_SUPPORT=0 \
|
||||
@@ -77,5 +110,5 @@ WORKDIR /sgl-workspace/DeepEP
|
||||
# NVSHMEM_DIR is exported with ENV, so it is visible both to the pip build below
# and at container runtime; repeating it inline on the RUN line is unnecessary.
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
# Install the package in the current working directory.
# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
RUN pip install --no-cache-dir --break-system-packages .
|
||||
|
||||
# Set workspace
WORKDIR /sgl-workspace

# Install mooncake transfer engine
# --upgrade pulls the newest published release on every uncached build;
# --no-cache-dir keeps pip's cache out of the image layer.
RUN pip install --no-cache-dir --upgrade mooncake_transfer_engine --break-system-packages
|
||||
|
||||
Reference in New Issue
Block a user