From 0166403c20295ee549ea3d5c57a41dac847f26a4 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:07:53 +0800 Subject: [PATCH] Support Blackwell DeepEP docker images (#6868) --- .github/workflows/release-docker-deepep.yml | 17 +++- .../workflows/release-docker-dev-deepep.yml | 36 --------- docker/Dockerfile.deepep | 8 +- docker/Dockerfile.dev | 3 + docker/Dockerfile.dev-deepep | 80 ------------------- 5 files changed, 23 insertions(+), 121 deletions(-) delete mode 100644 .github/workflows/release-docker-dev-deepep.yml delete mode 100644 docker/Dockerfile.dev-deepep diff --git a/.github/workflows/release-docker-deepep.yml b/.github/workflows/release-docker-deepep.yml index 9f8607d9a..25992a280 100644 --- a/.github/workflows/release-docker-deepep.yml +++ b/.github/workflows/release-docker-deepep.yml @@ -9,6 +9,17 @@ jobs: build-dev: if: ${{ github.repository == 'sgl-project/sglang' }} runs-on: ubuntu-22.04 + + strategy: + matrix: + variant: + - base: lmsysorg/sglang:latest + tag: deepep + - base: lmsysorg/sglang:dev + tag: dev-deepep + - base: lmsysorg/sglang:blackwell + tag: blackwell-deepep + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -30,7 +41,7 @@ jobs: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build and Push DeepEP Image + - name: Build and Push Docker Image run: | - docker build . -f docker/Dockerfile.deepep -t lmsysorg/sglang:deepep --no-cache - docker push lmsysorg/sglang:deepep + docker build . 
-f docker/Dockerfile.deepep --build-arg BASE_IMAGE=${{ matrix.variant.base }} -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache + docker push lmsysorg/sglang:${{ matrix.variant.tag }} diff --git a/.github/workflows/release-docker-dev-deepep.yml b/.github/workflows/release-docker-dev-deepep.yml deleted file mode 100644 index 5c7a4d6f3..000000000 --- a/.github/workflows/release-docker-dev-deepep.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Build Dev-DeepEP Docker Image - -on: - workflow_dispatch: - schedule: - - cron: '0 0 * * *' - -jobs: - build-dev: - if: ${{ github.repository == 'sgl-project/sglang' }} - runs-on: ubuntu-22.04 - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Free disk space - uses: jlumbroso/free-disk-space@main - with: - tool-cache: false - docker-images: false - android: true - dotnet: true - haskell: true - large-packages: true - swap-storage: false - - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and Push DeepEP Image - run: | - docker build . 
-f docker/Dockerfile.dev-deepep -t lmsysorg/sglang:dev-deepep --no-cache - docker push lmsysorg/sglang:dev-deepep diff --git a/docker/Dockerfile.deepep b/docker/Dockerfile.deepep index e54e9df8d..b27e102b9 100644 --- a/docker/Dockerfile.deepep +++ b/docker/Dockerfile.deepep @@ -1,4 +1,5 @@ -FROM lmsysorg/sglang:latest +ARG BASE_IMAGE +FROM ${BASE_IMAGE} # CMake RUN apt-get update \ @@ -55,6 +56,9 @@ RUN tar -xf nvshmem_src_3.2.5-1.txz \ WORKDIR /sgl-workspace/nvshmem RUN git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch +RUN sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu && \ + cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu + WORKDIR /sgl-workspace/nvshmem ENV CUDA_HOME=/usr/local/cuda RUN NVSHMEM_SHMEM_SUPPORT=0 \ @@ -71,7 +75,7 @@ RUN NVSHMEM_SHMEM_SUPPORT=0 \ WORKDIR /sgl-workspace/DeepEP ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install -RUN NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install . +RUN NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install --break-system-packages . 
# Set workspace WORKDIR /sgl-workspace diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 67310adaa..6fa0eb2d8 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -224,5 +224,8 @@ setopt HIST_FIND_NO_DUPS setopt INC_APPEND_HISTORY EOF +RUN set -euxo ; \ + curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to /usr/local/bin + # Set workspace directory WORKDIR /sgl-workspace/sglang diff --git a/docker/Dockerfile.dev-deepep b/docker/Dockerfile.dev-deepep deleted file mode 100644 index 71aaa8722..000000000 --- a/docker/Dockerfile.dev-deepep +++ /dev/null @@ -1,80 +0,0 @@ -FROM lmsysorg/sglang:dev - -# CMake -RUN apt-get update \ -&& apt-get install -y --no-install-recommends \ -build-essential \ -wget \ -libssl-dev \ -&& wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \ -&& chmod +x cmake-3.27.4-linux-x86_64.sh \ -&& ./cmake-3.27.4-linux-x86_64.sh --skip-license --prefix=/usr/local \ -&& rm cmake-3.27.4-linux-x86_64.sh - -# Python -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - && ln -s /usr/bin/python3 /usr/bin/python - -# GDRCopy -WORKDIR /tmp -RUN git clone https://github.com/NVIDIA/gdrcopy.git -WORKDIR /tmp/gdrcopy -RUN git checkout v2.4.4 - -RUN apt update -RUN apt install -y nvidia-dkms-535 -RUN apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms -RUN apt install -y check libsubunit0 libsubunit-dev - -WORKDIR /tmp/gdrcopy/packages -RUN CUDA=/usr/local/cuda ./build-deb-packages.sh -RUN dpkg -i gdrdrv-dkms_*.deb -RUN dpkg -i libgdrapi_*.deb -RUN dpkg -i gdrcopy-tests_*.deb -RUN dpkg -i gdrcopy_*.deb - -ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ - -# IBGDA dependency -RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so -RUN apt-get install -y libfabric-dev - -# DeepEP -WORKDIR /sgl-workspace -RUN git clone https://github.com/deepseek-ai/DeepEP.git - -# 
NVSHMEM -WORKDIR /sgl-workspace -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz -RUN tar -xf nvshmem_src_3.2.5-1.txz \ - && mv nvshmem_src nvshmem - -WORKDIR /sgl-workspace/nvshmem -RUN git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch - -WORKDIR /sgl-workspace/nvshmem -ENV CUDA_HOME=/usr/local/cuda -RUN NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/sgl-workspace/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cd build \ - && make install -j - -WORKDIR /sgl-workspace/DeepEP -ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install -RUN NVSHMEM_DIR=/sgl-workspace/nvshmem/install pip install . - -RUN set -euxo ; \ - curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to /usr/local/bin - -# Set workspace -WORKDIR /sgl-workspace