From 712bf9ec9b5141cca13dcef168f15dc074781e58 Mon Sep 17 00:00:00 2001 From: ybyang <10629930+whybeyoung@users.noreply.github.com> Date: Thu, 19 Jun 2025 02:26:41 +0800 Subject: [PATCH] [pd] optimize dockerfile for pd disaggregation (#7319) Co-authored-by: zhyncs --- docker/Dockerfile.deepep | 119 +++++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 43 deletions(-) diff --git a/docker/Dockerfile.deepep b/docker/Dockerfile.deepep index b88353466..8ddb6a459 100644 --- a/docker/Dockerfile.deepep +++ b/docker/Dockerfile.deepep @@ -1,67 +1,100 @@ ARG BASE_IMAGE FROM ${BASE_IMAGE} +# Deps +RUN apt-get update && apt-get install -y netcat-openbsd \ + libopenmpi-dev \ + kmod \ + rdma-core \ + infiniband-diags \ + openssh-server \ + perftest \ + ibverbs-providers \ + libibumad3 \ + libibverbs1 \ + libnl-3-200 \ + libnl-route-3-200 \ + librdmacm1 \ + build-essential \ + cmake \ + libibverbs-dev \ + libgoogle-glog-dev \ + libgtest-dev \ + libjsoncpp-dev \ + libnuma-dev \ + libibverbs-dev \ + libunwind-dev \ + libgoogle-glog-dev \ + libpython3-dev \ + libboost-all-dev \ + libssl-dev \ + libgrpc-dev \ + libgrpc++-dev \ + libprotobuf-dev \ + protobuf-compiler-grpc \ + pybind11-dev \ + libhiredis-dev \ + pkg-config \ + patchelf \ + ccache \ + libcurl4-openssl-dev \ + curl \ + pkg-config libczmq4 libczmq-dev \ + libnl-route-3-dev libnl-3-dev librdmacm1 \ + libhiredis-dev \ + nvidia-dkms-535 \ + build-essential \ + devscripts \ + debhelper \ + fakeroot \ + dkms \ + check \ + libsubunit0 \ + libsubunit-dev \ + libfabric-dev \ + python3 \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* \ + && ln -s /usr/bin/python3 /usr/bin/python # CMake -RUN apt-get update \ -&& apt-get install -y --no-install-recommends \ -build-essential \ -wget \ -libssl-dev \ -&& wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \ +RUN wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \ && chmod +x cmake-3.27.4-linux-x86_64.sh \ && ./cmake-3.27.4-linux-x86_64.sh --skip-license --prefix=/usr/local \ && rm cmake-3.27.4-linux-x86_64.sh -# Python -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - && ln -s /usr/bin/python3 /usr/bin/python - -# GDRCopy -WORKDIR /tmp -RUN git clone https://github.com/NVIDIA/gdrcopy.git -WORKDIR /tmp/gdrcopy -RUN git checkout v2.4.4 - -RUN apt update -RUN apt install -y nvidia-dkms-535 -RUN apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms -RUN apt install -y check libsubunit0 libsubunit-dev - -WORKDIR /tmp/gdrcopy/packages -RUN CUDA=/usr/local/cuda ./build-deb-packages.sh -RUN dpkg -i gdrdrv-dkms_*.deb -RUN dpkg -i libgdrapi_*.deb -RUN dpkg -i gdrcopy-tests_*.deb -RUN dpkg -i gdrcopy_*.deb ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ +# GDRCopy +RUN mkdir -p /tmp \ + && cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \ + && cd /tmp/gdrcopy/packages \ + && CUDA=/usr/local/cuda ./build-deb-packages.sh \ + && dpkg -i gdrdrv-dkms_*.deb \ + && dpkg -i libgdrapi_*.deb \ + && dpkg -i gdrcopy-tests_*.deb \ + && dpkg -i gdrcopy_*.deb + # IBGDA dependency RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so -RUN apt-get install -y libfabric-dev # DeepEP WORKDIR /sgl-workspace RUN git clone https://github.com/deepseek-ai/DeepEP.git # NVSHMEM -WORKDIR /sgl-workspace RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz RUN tar -xf nvshmem_src_3.2.5-1.txz \ - && mv nvshmem_src nvshmem + && mv nvshmem_src nvshmem \ + && cd /sgl-workspace/nvshmem \ + && git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \ + && sed -i '1i#include ' /sgl-workspace/nvshmem/examples/moe_shuffle.cu \ + && cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu -WORKDIR /sgl-workspace/nvshmem -RUN git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch - -RUN sed -i '1i#include ' /sgl-workspace/nvshmem/examples/moe_shuffle.cu && \ - cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu - -WORKDIR /sgl-workspace/nvshmem +# Compile NVSHMEM ENV CUDA_HOME=/usr/local/cuda -RUN NVSHMEM_SHMEM_SUPPORT=0 \ +RUN cd /sgl-workspace/nvshmem && NVSHMEM_SHMEM_SUPPORT=0 \ NVSHMEM_UCX_SUPPORT=0 \ NVSHMEM_USE_NCCL=0 \ NVSHMEM_MPI_SUPPORT=0 \ @@ -77,5 +110,5 @@ WORKDIR /sgl-workspace/DeepEP ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install RUN NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install --break-system-packages . -# Set workspace -WORKDIR /sgl-workspace +# Install mooncake transfer engine +RUN pip install --upgrade mooncake_transfer_engine --break-system-packages