[pd] optimize dockerfile for pd disaggregation (#7319)
Co-authored-by: zhyncs <me@zhyncs.com>
This commit is contained in:
@@ -1,67 +1,100 @@
|
|||||||
ARG BASE_IMAGE
|
ARG BASE_IMAGE
|
||||||
FROM ${BASE_IMAGE}
|
FROM ${BASE_IMAGE}
|
||||||
|
# Deps
# One apt layer with every system package the later steps rely on: RDMA/InfiniBand
# userspace and headers, the C/C++ build toolchain, the packaging tools used to
# build the GDRCopy .deb files (devscripts, debhelper, fakeroot, dkms, check,
# libsubunit), libfabric, and Python.
# Each package is listed exactly once, alphabetically, so additions diff cleanly.
# wget is installed here because the CMake step fetches its installer with wget.
# DEBIAN_FRONTEND is set inline (not via ENV) so it does not leak into the runtime env.
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
ccache \
check \
cmake \
curl \
debhelper \
devscripts \
dkms \
fakeroot \
ibverbs-providers \
infiniband-diags \
kmod \
libboost-all-dev \
libcurl4-openssl-dev \
libczmq-dev \
libczmq4 \
libfabric-dev \
libgoogle-glog-dev \
libgrpc++-dev \
libgrpc-dev \
libgtest-dev \
libhiredis-dev \
libibumad3 \
libibverbs-dev \
libibverbs1 \
libjsoncpp-dev \
libnl-3-200 \
libnl-3-dev \
libnl-route-3-200 \
libnl-route-3-dev \
libnuma-dev \
libopenmpi-dev \
libprotobuf-dev \
libpython3-dev \
librdmacm1 \
libssl-dev \
libsubunit-dev \
libsubunit0 \
libunwind-dev \
netcat-openbsd \
nvidia-dkms-535 \
openssh-server \
patchelf \
perftest \
pkg-config \
protobuf-compiler-grpc \
pybind11-dev \
python3 \
python3-pip \
rdma-core \
wget \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python3 /usr/bin/python
|
||||||
|
|
||||||
# CMake
|
# CMake
|
||||||
RUN apt-get update \
|
RUN wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \
|
||||||
&& apt-get install -y --no-install-recommends \
|
|
||||||
build-essential \
|
|
||||||
wget \
|
|
||||||
libssl-dev \
|
|
||||||
&& wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \
|
|
||||||
&& chmod +x cmake-3.27.4-linux-x86_64.sh \
|
&& chmod +x cmake-3.27.4-linux-x86_64.sh \
|
||||||
&& ./cmake-3.27.4-linux-x86_64.sh --skip-license --prefix=/usr/local \
|
&& ./cmake-3.27.4-linux-x86_64.sh --skip-license --prefix=/usr/local \
|
||||||
&& rm cmake-3.27.4-linux-x86_64.sh
|
&& rm cmake-3.27.4-linux-x86_64.sh
|
||||||
|
|
||||||
# Python
|
|
||||||
RUN apt-get update \
|
|
||||||
&& apt-get install -y --no-install-recommends \
|
|
||||||
python3 \
|
|
||||||
python3-pip \
|
|
||||||
&& ln -s /usr/bin/python3 /usr/bin/python
|
|
||||||
|
|
||||||
# GDRCopy
|
|
||||||
WORKDIR /tmp
|
|
||||||
RUN git clone https://github.com/NVIDIA/gdrcopy.git
|
|
||||||
WORKDIR /tmp/gdrcopy
|
|
||||||
RUN git checkout v2.4.4
|
|
||||||
|
|
||||||
RUN apt update
|
|
||||||
RUN apt install -y nvidia-dkms-535
|
|
||||||
RUN apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms
|
|
||||||
RUN apt install -y check libsubunit0 libsubunit-dev
|
|
||||||
|
|
||||||
WORKDIR /tmp/gdrcopy/packages
|
|
||||||
RUN CUDA=/usr/local/cuda ./build-deb-packages.sh
|
|
||||||
RUN dpkg -i gdrdrv-dkms_*.deb
|
|
||||||
RUN dpkg -i libgdrapi_*.deb
|
|
||||||
RUN dpkg -i gdrcopy-tests_*.deb
|
|
||||||
RUN dpkg -i gdrcopy_*.deb
|
|
||||||
|
|
||||||
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
|
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
|
||||||
|
# GDRCopy
# Build the GDRCopy v2.4.4 Debian packages from source and install them.
# Everything happens in one RUN so the source checkout can be removed before the
# layer is committed; only the installed packages remain in the image.
# --depth 1 fetches just the tagged commit instead of the full history.
RUN git clone --depth 1 -b v2.4.4 https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
&& cd /tmp/gdrcopy/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb \
&& dpkg -i libgdrapi_*.deb \
&& dpkg -i gdrcopy-tests_*.deb \
&& dpkg -i gdrcopy_*.deb \
&& cd / \
&& rm -rf /tmp/gdrcopy
|
||||||
|
|
||||||
|
|
||||||
# IBGDA dependency
|
# IBGDA dependency
|
||||||
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
|
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
|
||||||
RUN apt-get install -y libfabric-dev
|
|
||||||
|
|
||||||
# DeepEP
|
# DeepEP
|
||||||
WORKDIR /sgl-workspace
|
WORKDIR /sgl-workspace
|
||||||
RUN git clone https://github.com/deepseek-ai/DeepEP.git
|
RUN git clone https://github.com/deepseek-ai/DeepEP.git
|
||||||
|
|
||||||
# NVSHMEM
|
# NVSHMEM
|
||||||
WORKDIR /sgl-workspace
|
|
||||||
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
|
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
|
||||||
RUN tar -xf nvshmem_src_3.2.5-1.txz \
|
RUN tar -xf nvshmem_src_3.2.5-1.txz \
|
||||||
&& mv nvshmem_src nvshmem
|
&& mv nvshmem_src nvshmem \
|
||||||
|
&& cd /sgl-workspace/nvshmem \
|
||||||
|
&& git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
|
||||||
|
&& sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu \
|
||||||
|
&& cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu
|
||||||
|
|
||||||
WORKDIR /sgl-workspace/nvshmem
|
# Compile NVSHMEM
|
||||||
RUN git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch
|
|
||||||
|
|
||||||
RUN sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu && \
|
|
||||||
cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu
|
|
||||||
|
|
||||||
WORKDIR /sgl-workspace/nvshmem
|
|
||||||
ENV CUDA_HOME=/usr/local/cuda
|
ENV CUDA_HOME=/usr/local/cuda
|
||||||
RUN NVSHMEM_SHMEM_SUPPORT=0 \
|
RUN cd /sgl-workspace/nvshmem && NVSHMEM_SHMEM_SUPPORT=0 \
|
||||||
NVSHMEM_UCX_SUPPORT=0 \
|
NVSHMEM_UCX_SUPPORT=0 \
|
||||||
NVSHMEM_USE_NCCL=0 \
|
NVSHMEM_USE_NCCL=0 \
|
||||||
NVSHMEM_MPI_SUPPORT=0 \
|
NVSHMEM_MPI_SUPPORT=0 \
|
||||||
@@ -77,5 +110,5 @@ WORKDIR /sgl-workspace/DeepEP
|
|||||||
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
|
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
|
||||||
RUN NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install --break-system-packages .
|
RUN NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install --break-system-packages .
|
||||||
|
|
||||||
# Set workspace
|
# Install mooncake transfer engine
|
||||||
WORKDIR /sgl-workspace
|
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir --upgrade mooncake_transfer_engine --break-system-packages
|
||||||
|
|||||||
Reference in New Issue
Block a user