# Multi-architecture (x86_64 + arm64) Docker release patch.
#
# What the hunks below do:
#   * release-docker-dev.yml: renames build-dev to build-dev-x86 and adds the
#     build-blackwell-x86 / build-blackwell-arm jobs plus a create-manifests
#     job. create-manifests waits for both blackwell jobs and joins the
#     blackwell-cu129-x86 and blackwell-cu129-arm64 tags into
#     lmsysorg/sglang:blackwell-cu129 via `docker buildx imagetools create`.
#   * release-docker-gb200.yml: removes the push trigger, leaving only
#     workflow_dispatch.
#   * release-docker.yml: splits publish into publish-x86 / publish-arm64 and
#     builds with `docker buildx build --platform ... --push`.
#   * release-whl-kernel.yml: passes --skip-existing to twine and adds a PyPI
#     upload step after the aarch64 build step.
#   * docker/Dockerfile: replaces hard-coded x86_64 paths/URLs with
#     $(uname -m) lookups, adds ARG SGL_KERNEL_VERSION, and picks the DeepEP
#     source and the NVSHMEM CUDA architectures from BUILD_TYPE.
#
# The DeepEP fork is cloned on aarch64 for both BUILD_TYPE=blackwell and
# BUILD_TYPE=blackwell_aarch. The ARM workflow jobs in this patch only ever
# pass blackwell_aarch, so a guard that tested for "blackwell" alone would
# never select the fork from CI.
#
# NOTE(review): release-docker.yml now pushes `latest` from the CUDA 12.9.1
# `blackwell` variant and no longer pushes the bare v${version} tag or
# latest${tag_suffix} -- confirm this is intended.
diff --git a/.github/workflows/release-docker-dev.yml b/.github/workflows/release-docker-dev.yml
index 561309ff3..746f641ae 100644
--- a/.github/workflows/release-docker-dev.yml
+++ b/.github/workflows/release-docker-dev.yml
@@ -6,7 +6,7 @@ on:
     - cron: '0 0 * * *'
 
 jobs:
-  build-dev:
+  build-dev-x86:
     if: ${{ github.repository == 'sgl-project/sglang' }}
     runs-on: ubuntu-22.04
     strategy:
@@ -15,6 +15,46 @@ jobs:
           - version: 12.9.1
             type: all
             tag: dev
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: false
+          docker-images: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: false
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push Dev Image (x86)
+        run: |
+          docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
+
+  build-blackwell-x86:
+    if: ${{ github.repository == 'sgl-project/sglang' }}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        variant:
+          - version: 12.8.1
+            type: blackwell
+            tag: blackwell
+          - version: 12.9.1
+            type: blackwell
+            tag: blackwell-cu129
 
     steps:
       - name: Checkout repository
@@ -31,13 +71,79 @@ jobs:
           large-packages: true
           swap-storage: false
 
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
       - name: Login to Docker Hub
         uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
-      - name: Build and Push Dev Image
+      - name: Build and Push Blackwell Image (x86)
         run: |
-          docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache
-          docker push lmsysorg/sglang:${{ matrix.variant.tag }}
+          if [ "${{ matrix.variant.version }}" = "12.9.1" ]; then
+            docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }}-x86 --no-cache .
+          else
+            docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
+          fi
+
+  build-blackwell-arm:
+    if: ${{ github.repository == 'sgl-project/sglang' }}
+    runs-on: ubuntu-22.04-arm
+    strategy:
+      matrix:
+        variant:
+          - version: 12.9.1
+            type: blackwell_aarch
+            tag: blackwell-cu129
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: false
+          docker-images: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: false
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push Blackwell Image (ARM)
+        run: |
+          docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }}-arm64 --no-cache .
+
+
+  create-manifests:
+    runs-on: ubuntu-22.04
+    needs: [build-blackwell-x86, build-blackwell-arm]
+    if: ${{ github.repository == 'sgl-project/sglang' }}
+    strategy:
+      matrix:
+        variant:
+          - tag: blackwell-cu129
+            x86_tag: blackwell-cu129-x86
+            arm64_tag: blackwell-cu129-arm64
+    steps:
+      - uses: docker/setup-buildx-action@v3
+      - uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - run: |
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:${{ matrix.variant.tag }} \
+            lmsysorg/sglang:${{ matrix.variant.x86_tag }} \
+            lmsysorg/sglang:${{ matrix.variant.arm64_tag }}
diff --git a/.github/workflows/release-docker-gb200.yml b/.github/workflows/release-docker-gb200.yml
index 87de03b85..c740d38a4 100644
--- a/.github/workflows/release-docker-gb200.yml
+++ b/.github/workflows/release-docker-gb200.yml
@@ -1,10 +1,5 @@
 name: Release Docker Images (GB200)
 on:
-  push:
-    branches:
-      - main
-    paths:
-      - "python/sglang/version.py"
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index 711cad46b..3d3c7e5e7 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -8,14 +8,19 @@ on:
   workflow_dispatch:
 
 jobs:
-  publish:
+  publish-x86:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: ubuntu-latest
     environment: 'prod'
     strategy:
       matrix:
-        cuda_version: ['12.6.1', '12.9.1']
-        build_type: ['all']
+        variant:
+          - cuda_version: '12.6.1'
+            build_type: 'all'
+          - cuda_version: '12.8.1'
+            build_type: 'blackwell'
+          - cuda_version: '12.9.1'
+            build_type: 'blackwell'
+    runs-on: ubuntu-latest
     steps:
       - name: Delete huge unnecessary tools folder
         run: rm -rf /opt/hostedtoolcache
@@ -34,29 +39,24 @@ jobs:
           large-packages: true
           swap-storage: false
 
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
       - name: Login to Docker Hub
         uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
-      - name: Build and Push
+      - name: Build and Push AMD64
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)
 
-          if [ "${{ matrix.cuda_version }}" = "11.8.0" ]; then
-            cuda_tag="cu118"
-          elif [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
-            cuda_tag="cu121"
-          elif [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
-            cuda_tag="cu124"
-          elif [ "${{ matrix.cuda_version }}" = "12.5.1" ]; then
-            cuda_tag="cu125"
-          elif [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
+          if [ "${{ matrix.variant.cuda_version }}" = "12.6.1" ]; then
             cuda_tag="cu126"
-          elif [ "${{ matrix.cuda_version }}" = "12.8.1" ]; then
+          elif [ "${{ matrix.variant.cuda_version }}" = "12.8.1" ]; then
             cuda_tag="cu128"
-          elif [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
+          elif [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
             cuda_tag="cu129"
           else
             echo "Unsupported CUDA version"
@@ -65,25 +65,58 @@ jobs:
 
           tag=v${version}-${cuda_tag}
 
-          if [ "${{ matrix.build_type }}" = "all" ]; then
+          if [ "${{ matrix.variant.build_type }}" = "all" ]; then
             tag_suffix=""
-          elif [ "${{ matrix.build_type }}" = "srt" ]; then
-            tag_suffix="-srt"
-          elif [ "${{ matrix.build_type }}" = "blackwell" ]; then
+          elif [ "${{ matrix.variant.build_type }}" = "blackwell" ]; then
             tag_suffix="-b200"
           else
             echo "Unsupported build type"
             exit 1
           fi
 
-          docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
-
-          if [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
-            docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:latest${tag_suffix}
-            docker push lmsysorg/sglang:latest${tag_suffix}
+          if [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
+            docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} -t lmsysorg/sglang:latest --no-cache .
+          else
+            docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache .
           fi
 
-          if [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
-            docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:v${version}
-            docker push lmsysorg/sglang:v${version}
+  publish-arm64:
+    if: github.repository == 'sgl-project/sglang'
+    environment: 'prod'
+    strategy:
+      matrix:
+        variant:
+          - cuda_version: '12.9.1'
+            build_type: 'blackwell_aarch'
+    runs-on: ubuntu-22.04-arm
+    steps:
+      - name: Delete huge unnecessary tools folder
+        run: rm -rf /opt/hostedtoolcache
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push ARM64
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+
+          if [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
+            cuda_tag="cu129"
+          else
+            echo "Unsupported CUDA version"
+            exit 1
           fi
+
+          tag=v${version}-${cuda_tag}
+          tag_suffix="-gb200"
+
+          docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache .
diff --git a/.github/workflows/release-whl-kernel.yml b/.github/workflows/release-whl-kernel.yml
index ffc8eb450..75f94996f 100644
--- a/.github/workflows/release-whl-kernel.yml
+++ b/.github/workflows/release-whl-kernel.yml
@@ -44,7 +44,7 @@ jobs:
         working-directory: sgl-kernel
         run: |
           pip install twine
-          python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+          python3 -m twine upload --skip-existing dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
 
   build-cu124:
     if: github.repository == 'sgl-project/sglang'
@@ -227,6 +227,12 @@ jobs:
           chmod +x ./build.sh
           ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" aarch64
 
+      - name: Upload to PyPI
+        working-directory: sgl-kernel
+        run: |
+          pip install twine
+          python3 -m twine upload --skip-existing dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2f4c22d90..45c6e5dd7 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,10 +1,11 @@
 ARG CUDA_VERSION=12.9.1
-FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 as base
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base
 
 ARG BUILD_TYPE=all
 ARG BRANCH_TYPE=remote
 ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
 ARG CMAKE_BUILD_PARALLEL_LEVEL=2
+ARG SGL_KERNEL_VERSION=0.3.12
 ENV DEBIAN_FRONTEND=noninteractive \
     CUDA_HOME=/usr/local/cuda \
     GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
@@ -57,7 +58,7 @@ RUN mkdir -p /tmp/gdrcopy && cd /tmp \
     && cd / && rm -rf /tmp/gdrcopy
 
 # Fix DeepEP IBGDA symlink
-RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
+RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
 
 FROM scratch AS local_src
 COPY . /src
@@ -81,24 +82,33 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
         12.9.1) CUINDEX=129 ;; \
         *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
     esac \
+    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
+        python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
+    fi \
+    && if [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
+        python3 -m pip install --no-cache-dir sgl-kernel==${SGL_KERNEL_VERSION} ; \
+    fi \
     && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
     && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
-    && python3 -m flashinfer --download-cubin \
-    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
-        python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.11/sgl_kernel-0.3.11+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
-    fi
+    && python3 -m flashinfer --download-cubin
+
 
 # Download source files
 RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
-    git clone https://github.com/deepseek-ai/DeepEP.git && \
-    cd DeepEP && git checkout ${DEEPEP_COMMIT} && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
-    cd .. && \
-    tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
-    mv nvshmem_src nvshmem && \
-    rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+    if { [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; } && [ "$(uname -m)" = "aarch64" ]; then \
+      git clone https://github.com/fzyzcjy/DeepEP.git \
+      && cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
+    else \
+      git clone https://github.com/deepseek-ai/DeepEP.git \
+      && cd DeepEP && git checkout ${DEEPEP_COMMIT} && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
+    fi \
+    && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
+    && mv nvshmem_src nvshmem \
+    && rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
 
 # Build and install NVSHMEM
 RUN cd /sgl-workspace/nvshmem && \
+    if [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
     NVSHMEM_SHMEM_SUPPORT=0 \
     NVSHMEM_UCX_SUPPORT=0 \
     NVSHMEM_USE_NCCL=0 \
@@ -107,7 +117,7 @@ RUN cd /sgl-workspace/nvshmem && \
     NVSHMEM_PMIX_SUPPORT=0 \
     NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
     NVSHMEM_USE_GDRCOPY=1 \
-    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="90" && \
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} && \
     cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
 
 # Install DeepEP
@@ -125,6 +135,7 @@ RUN cd /sgl-workspace/DeepEP && \
     esac && \
     NVSHMEM_DIR=${NVSHMEM_DIR} TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" pip install .
 
+
 # Python tools
 RUN python3 -m pip install --no-cache-dir \
     datamodel_code_generator \
@@ -169,16 +180,16 @@ RUN apt-get update && apt-get install -y \
 
 RUN apt update -y \
     && apt install -y --no-install-recommends gnupg \
-    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
-    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub \
+    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
+    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "x86_64"; fi)/7fa2af80.pub \
     && apt update -y \
     && apt install nsight-systems-cli -y
 
 # Set up locale
 RUN locale-gen en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US:en
-ENV LC_ALL en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US:en
+ENV LC_ALL=en_US.UTF-8
 
 # Install minimal Python packages
 RUN python3 -m pip install --no-cache-dir --break-system-packages \
@@ -186,7 +197,7 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \
     black \
     isort \
     icdiff \
-    scikit_build_core \
+    scikit-build-core \
     uv \
     pre-commit \
     pandas \
@@ -209,11 +220,14 @@ RUN curl -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-lin
     && rm -rf clangd_18.1.3 clangd.zip
 
 # Install CMake
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.tar.gz \
-    && tar -xzf cmake-3.31.1-linux-x86_64.tar.gz \
-    && cp -r cmake-3.31.1-linux-x86_64/bin/* /usr/local/bin/ \
-    && cp -r cmake-3.31.1-linux-x86_64/share/* /usr/local/share/ \
-    && rm -rf cmake-3.31.1-linux-x86_64 cmake-3.31.1-linux-x86_64.tar.gz
+RUN CMAKE_VERSION=3.31.1 \
+    && ARCH=$(uname -m) \
+    && CMAKE_INSTALLER="cmake-${CMAKE_VERSION}-linux-${ARCH}" \
+    && wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
+    && tar -xzf "${CMAKE_INSTALLER}.tar.gz" \
+    && cp -r "${CMAKE_INSTALLER}/bin/"* /usr/local/bin/ \
+    && cp -r "${CMAKE_INSTALLER}/share/"* /usr/local/share/ \
+    && rm -rf "${CMAKE_INSTALLER}" "${CMAKE_INSTALLER}.tar.gz"
 
 # Install Rust toolchain for sgl-router
 ENV PATH="/root/.cargo/bin:${PATH}"