feat: unify dockerfiles (#10705)
Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
114
.github/workflows/release-docker-dev.yml
vendored
114
.github/workflows/release-docker-dev.yml
vendored
@@ -6,7 +6,7 @@ on:
|
|||||||
- cron: '0 0 * * *'
|
- cron: '0 0 * * *'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-dev:
|
build-dev-x86:
|
||||||
if: ${{ github.repository == 'sgl-project/sglang' }}
|
if: ${{ github.repository == 'sgl-project/sglang' }}
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
strategy:
|
strategy:
|
||||||
@@ -15,6 +15,46 @@ jobs:
|
|||||||
- version: 12.9.1
|
- version: 12.9.1
|
||||||
type: all
|
type: all
|
||||||
tag: dev
|
tag: dev
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Free disk space
|
||||||
|
uses: jlumbroso/free-disk-space@main
|
||||||
|
with:
|
||||||
|
tool-cache: false
|
||||||
|
docker-images: false
|
||||||
|
android: true
|
||||||
|
dotnet: true
|
||||||
|
haskell: true
|
||||||
|
large-packages: true
|
||||||
|
swap-storage: false
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Login to Docker Hub
|
||||||
|
uses: docker/login-action@v2
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
|
||||||
|
- name: Build and Push Dev Image (x86)
|
||||||
|
run: |
|
||||||
|
docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
|
||||||
|
|
||||||
|
build-blackwell-x86:
|
||||||
|
if: ${{ github.repository == 'sgl-project/sglang' }}
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
variant:
|
||||||
|
- version: 12.8.1
|
||||||
|
type: blackwell
|
||||||
|
tag: blackwell
|
||||||
|
- version: 12.9.1
|
||||||
|
type: blackwell
|
||||||
|
tag: blackwell-cu129
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
@@ -31,13 +71,79 @@ jobs:
|
|||||||
large-packages: true
|
large-packages: true
|
||||||
swap-storage: false
|
swap-storage: false
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
- name: Login to Docker Hub
|
- name: Login to Docker Hub
|
||||||
uses: docker/login-action@v2
|
uses: docker/login-action@v2
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
|
||||||
- name: Build and Push Dev Image
|
- name: Build and Push Blackwell Image (x86)
|
||||||
run: |
|
run: |
|
||||||
docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache
|
if [ "${{ matrix.variant.version }}" = "12.9.1" ]; then
|
||||||
docker push lmsysorg/sglang:${{ matrix.variant.tag }}
|
docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }}-x86 --no-cache .
|
||||||
|
else
|
||||||
|
docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
|
||||||
|
fi
|
||||||
|
|
||||||
|
build-blackwell-arm:
|
||||||
|
if: ${{ github.repository == 'sgl-project/sglang' }}
|
||||||
|
runs-on: ubuntu-22.04-arm
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
variant:
|
||||||
|
- version: 12.9.1
|
||||||
|
type: blackwell_aarch
|
||||||
|
tag: blackwell-cu129
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Free disk space
|
||||||
|
uses: jlumbroso/free-disk-space@main
|
||||||
|
with:
|
||||||
|
tool-cache: false
|
||||||
|
docker-images: false
|
||||||
|
android: true
|
||||||
|
dotnet: true
|
||||||
|
haskell: true
|
||||||
|
large-packages: true
|
||||||
|
swap-storage: false
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Login to Docker Hub
|
||||||
|
uses: docker/login-action@v2
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
|
||||||
|
- name: Build and Push Blackwell Image (ARM)
|
||||||
|
run: |
|
||||||
|
docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }}-arm64 --no-cache .
|
||||||
|
|
||||||
|
|
||||||
|
create-manifests:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
needs: [build-blackwell-x86, build-blackwell-arm]
|
||||||
|
if: ${{ github.repository == 'sgl-project/sglang' }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
variant:
|
||||||
|
- tag: blackwell-cu129
|
||||||
|
x86_tag: blackwell-cu129-x86
|
||||||
|
arm64_tag: blackwell-cu129-arm64
|
||||||
|
steps:
|
||||||
|
- uses: docker/setup-buildx-action@v3
|
||||||
|
- uses: docker/login-action@v2
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
- run: |
|
||||||
|
docker buildx imagetools create \
|
||||||
|
-t lmsysorg/sglang:${{ matrix.variant.tag }} \
|
||||||
|
lmsysorg/sglang:${{ matrix.variant.x86_tag }} \
|
||||||
|
lmsysorg/sglang:${{ matrix.variant.arm64_tag }}
|
||||||
|
|||||||
5
.github/workflows/release-docker-gb200.yml
vendored
5
.github/workflows/release-docker-gb200.yml
vendored
@@ -1,10 +1,5 @@
|
|||||||
name: Release Docker Images (GB200)
|
name: Release Docker Images (GB200)
|
||||||
on:
|
on:
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
paths:
|
|
||||||
- "python/sglang/version.py"
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|||||||
89
.github/workflows/release-docker.yml
vendored
89
.github/workflows/release-docker.yml
vendored
@@ -8,14 +8,19 @@ on:
|
|||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
publish:
|
publish-x86:
|
||||||
if: github.repository == 'sgl-project/sglang'
|
if: github.repository == 'sgl-project/sglang'
|
||||||
runs-on: ubuntu-latest
|
|
||||||
environment: 'prod'
|
environment: 'prod'
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
cuda_version: ['12.6.1', '12.9.1']
|
variant:
|
||||||
build_type: ['all']
|
- cuda_version: '12.6.1'
|
||||||
|
build_type: 'all'
|
||||||
|
- cuda_version: '12.8.1'
|
||||||
|
build_type: 'blackwell'
|
||||||
|
- cuda_version: '12.9.1'
|
||||||
|
build_type: 'blackwell'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Delete huge unnecessary tools folder
|
- name: Delete huge unnecessary tools folder
|
||||||
run: rm -rf /opt/hostedtoolcache
|
run: rm -rf /opt/hostedtoolcache
|
||||||
@@ -34,29 +39,24 @@ jobs:
|
|||||||
large-packages: true
|
large-packages: true
|
||||||
swap-storage: false
|
swap-storage: false
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
- name: Login to Docker Hub
|
- name: Login to Docker Hub
|
||||||
uses: docker/login-action@v2
|
uses: docker/login-action@v2
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
|
||||||
- name: Build and Push
|
- name: Build and Push AMD64
|
||||||
run: |
|
run: |
|
||||||
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
||||||
|
|
||||||
if [ "${{ matrix.cuda_version }}" = "11.8.0" ]; then
|
if [ "${{ matrix.variant.cuda_version }}" = "12.6.1" ]; then
|
||||||
cuda_tag="cu118"
|
|
||||||
elif [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
|
|
||||||
cuda_tag="cu121"
|
|
||||||
elif [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
|
|
||||||
cuda_tag="cu124"
|
|
||||||
elif [ "${{ matrix.cuda_version }}" = "12.5.1" ]; then
|
|
||||||
cuda_tag="cu125"
|
|
||||||
elif [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
|
|
||||||
cuda_tag="cu126"
|
cuda_tag="cu126"
|
||||||
elif [ "${{ matrix.cuda_version }}" = "12.8.1" ]; then
|
elif [ "${{ matrix.variant.cuda_version }}" = "12.8.1" ]; then
|
||||||
cuda_tag="cu128"
|
cuda_tag="cu128"
|
||||||
elif [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
|
elif [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
|
||||||
cuda_tag="cu129"
|
cuda_tag="cu129"
|
||||||
else
|
else
|
||||||
echo "Unsupported CUDA version"
|
echo "Unsupported CUDA version"
|
||||||
@@ -65,25 +65,58 @@ jobs:
|
|||||||
|
|
||||||
tag=v${version}-${cuda_tag}
|
tag=v${version}-${cuda_tag}
|
||||||
|
|
||||||
if [ "${{ matrix.build_type }}" = "all" ]; then
|
if [ "${{ matrix.variant.build_type }}" = "all" ]; then
|
||||||
tag_suffix=""
|
tag_suffix=""
|
||||||
elif [ "${{ matrix.build_type }}" = "srt" ]; then
|
elif [ "${{ matrix.variant.build_type }}" = "blackwell" ]; then
|
||||||
tag_suffix="-srt"
|
|
||||||
elif [ "${{ matrix.build_type }}" = "blackwell" ]; then
|
|
||||||
tag_suffix="-b200"
|
tag_suffix="-b200"
|
||||||
else
|
else
|
||||||
echo "Unsupported build type"
|
echo "Unsupported build type"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
|
if [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
|
||||||
|
docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} -t lmsysorg/sglang:latest --no-cache .
|
||||||
if [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
|
else
|
||||||
docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:latest${tag_suffix}
|
docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache .
|
||||||
docker push lmsysorg/sglang:latest${tag_suffix}
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
|
publish-arm64:
|
||||||
docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:v${version}
|
if: github.repository == 'sgl-project/sglang'
|
||||||
docker push lmsysorg/sglang:v${version}
|
environment: 'prod'
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
variant:
|
||||||
|
- cuda_version: '12.9.1'
|
||||||
|
build_type: 'blackwell_aarch'
|
||||||
|
runs-on: ubuntu-22.04-arm
|
||||||
|
steps:
|
||||||
|
- name: Delete huge unnecessary tools folder
|
||||||
|
run: rm -rf /opt/hostedtoolcache
|
||||||
|
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Login to Docker Hub
|
||||||
|
uses: docker/login-action@v2
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
|
||||||
|
- name: Build and Push ARM64
|
||||||
|
run: |
|
||||||
|
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
||||||
|
|
||||||
|
if [ "${{ matrix.variant.cuda_version }}" = "12.9.1" ]; then
|
||||||
|
cuda_tag="cu129"
|
||||||
|
else
|
||||||
|
echo "Unsupported CUDA version"
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
tag=v${version}-${cuda_tag}
|
||||||
|
tag_suffix="-gb200"
|
||||||
|
|
||||||
|
docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache .
|
||||||
|
|||||||
8
.github/workflows/release-whl-kernel.yml
vendored
8
.github/workflows/release-whl-kernel.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
|||||||
working-directory: sgl-kernel
|
working-directory: sgl-kernel
|
||||||
run: |
|
run: |
|
||||||
pip install twine
|
pip install twine
|
||||||
python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
|
python3 -m twine upload --skip-existing dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
|
||||||
|
|
||||||
build-cu124:
|
build-cu124:
|
||||||
if: github.repository == 'sgl-project/sglang'
|
if: github.repository == 'sgl-project/sglang'
|
||||||
@@ -227,6 +227,12 @@ jobs:
|
|||||||
chmod +x ./build.sh
|
chmod +x ./build.sh
|
||||||
./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" aarch64
|
./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" aarch64
|
||||||
|
|
||||||
|
- name: Upload to PyPI
|
||||||
|
working-directory: sgl-kernel
|
||||||
|
run: |
|
||||||
|
pip install twine
|
||||||
|
python3 -m twine upload --skip-existing dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
|
||||||
|
|
||||||
- name: Upload artifacts
|
- name: Upload artifacts
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
ARG CUDA_VERSION=12.9.1
|
ARG CUDA_VERSION=12.9.1
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 as base
|
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base
|
||||||
|
|
||||||
ARG BUILD_TYPE=all
|
ARG BUILD_TYPE=all
|
||||||
ARG BRANCH_TYPE=remote
|
ARG BRANCH_TYPE=remote
|
||||||
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
|
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
|
||||||
ARG CMAKE_BUILD_PARALLEL_LEVEL=2
|
ARG CMAKE_BUILD_PARALLEL_LEVEL=2
|
||||||
|
ARG SGL_KERNEL_VERSION=0.3.12
|
||||||
ENV DEBIAN_FRONTEND=noninteractive \
|
ENV DEBIAN_FRONTEND=noninteractive \
|
||||||
CUDA_HOME=/usr/local/cuda \
|
CUDA_HOME=/usr/local/cuda \
|
||||||
GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
|
GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
|
||||||
@@ -57,7 +58,7 @@ RUN mkdir -p /tmp/gdrcopy && cd /tmp \
|
|||||||
&& cd / && rm -rf /tmp/gdrcopy
|
&& cd / && rm -rf /tmp/gdrcopy
|
||||||
|
|
||||||
# Fix DeepEP IBGDA symlink
|
# Fix DeepEP IBGDA symlink
|
||||||
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
|
RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
|
||||||
|
|
||||||
FROM scratch AS local_src
|
FROM scratch AS local_src
|
||||||
COPY . /src
|
COPY . /src
|
||||||
@@ -81,24 +82,33 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
|
|||||||
12.9.1) CUINDEX=129 ;; \
|
12.9.1) CUINDEX=129 ;; \
|
||||||
*) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
|
*) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
|
||||||
esac \
|
esac \
|
||||||
|
&& if [ "$CUDA_VERSION" = "12.6.1" ]; then \
|
||||||
|
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
|
||||||
|
fi \
|
||||||
|
&& if [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
|
||||||
|
python3 -m pip install --no-cache-dir sgl-kernel==${SGL_KERNEL_VERSION} ; \
|
||||||
|
fi \
|
||||||
&& python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
|
&& python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
|
||||||
&& python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
|
&& python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
|
||||||
&& python3 -m flashinfer --download-cubin \
|
&& python3 -m flashinfer --download-cubin
|
||||||
&& if [ "$CUDA_VERSION" = "12.6.1" ]; then \
|
|
||||||
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.11/sgl_kernel-0.3.11+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Download source files
|
# Download source files
|
||||||
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
|
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
|
||||||
git clone https://github.com/deepseek-ai/DeepEP.git && \
|
if [ "$BUILD_TYPE" = "blackwell" ] && [ "$(uname -m)" = "aarch64" ]; then \
|
||||||
cd DeepEP && git checkout ${DEEPEP_COMMIT} && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
|
git clone https://github.com/fzyzcjy/DeepEP.git \
|
||||||
cd .. && \
|
&& cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
|
||||||
tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
|
else \
|
||||||
mv nvshmem_src nvshmem && \
|
git clone https://github.com/deepseek-ai/DeepEP.git \
|
||||||
rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
|
&& cd DeepEP && git checkout ${DEEPEP_COMMIT} && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
|
||||||
|
fi \
|
||||||
|
&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
|
||||||
|
&& mv nvshmem_src nvshmem \
|
||||||
|
&& rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
|
||||||
|
|
||||||
# Build and install NVSHMEM
|
# Build and install NVSHMEM
|
||||||
RUN cd /sgl-workspace/nvshmem && \
|
RUN cd /sgl-workspace/nvshmem && \
|
||||||
|
if [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
|
||||||
NVSHMEM_SHMEM_SUPPORT=0 \
|
NVSHMEM_SHMEM_SUPPORT=0 \
|
||||||
NVSHMEM_UCX_SUPPORT=0 \
|
NVSHMEM_UCX_SUPPORT=0 \
|
||||||
NVSHMEM_USE_NCCL=0 \
|
NVSHMEM_USE_NCCL=0 \
|
||||||
@@ -107,7 +117,7 @@ RUN cd /sgl-workspace/nvshmem && \
|
|||||||
NVSHMEM_PMIX_SUPPORT=0 \
|
NVSHMEM_PMIX_SUPPORT=0 \
|
||||||
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
|
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
|
||||||
NVSHMEM_USE_GDRCOPY=1 \
|
NVSHMEM_USE_GDRCOPY=1 \
|
||||||
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="90" && \
|
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH} && \
|
||||||
cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
|
cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
|
||||||
|
|
||||||
# Install DeepEP
|
# Install DeepEP
|
||||||
@@ -125,6 +135,7 @@ RUN cd /sgl-workspace/DeepEP && \
|
|||||||
esac && \
|
esac && \
|
||||||
NVSHMEM_DIR=${NVSHMEM_DIR} TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" pip install .
|
NVSHMEM_DIR=${NVSHMEM_DIR} TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" pip install .
|
||||||
|
|
||||||
|
|
||||||
# Python tools
|
# Python tools
|
||||||
RUN python3 -m pip install --no-cache-dir \
|
RUN python3 -m pip install --no-cache-dir \
|
||||||
datamodel_code_generator \
|
datamodel_code_generator \
|
||||||
@@ -169,16 +180,16 @@ RUN apt-get update && apt-get install -y \
|
|||||||
|
|
||||||
RUN apt update -y \
|
RUN apt update -y \
|
||||||
&& apt install -y --no-install-recommends gnupg \
|
&& apt install -y --no-install-recommends gnupg \
|
||||||
&& echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
|
&& echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
|
||||||
&& apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub \
|
&& apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "x86_64"; fi)/7fa2af80.pub \
|
||||||
&& apt update -y \
|
&& apt update -y \
|
||||||
&& apt install nsight-systems-cli -y
|
&& apt install nsight-systems-cli -y
|
||||||
|
|
||||||
# Set up locale
|
# Set up locale
|
||||||
RUN locale-gen en_US.UTF-8
|
RUN locale-gen en_US.UTF-8
|
||||||
ENV LANG en_US.UTF-8
|
ENV LANG=en_US.UTF-8
|
||||||
ENV LANGUAGE en_US:en
|
ENV LANGUAGE=en_US:en
|
||||||
ENV LC_ALL en_US.UTF-8
|
ENV LC_ALL=en_US.UTF-8
|
||||||
|
|
||||||
# Install minimal Python packages
|
# Install minimal Python packages
|
||||||
RUN python3 -m pip install --no-cache-dir --break-system-packages \
|
RUN python3 -m pip install --no-cache-dir --break-system-packages \
|
||||||
@@ -186,7 +197,7 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \
|
|||||||
black \
|
black \
|
||||||
isort \
|
isort \
|
||||||
icdiff \
|
icdiff \
|
||||||
scikit_build_core \
|
scikit-build-core \
|
||||||
uv \
|
uv \
|
||||||
pre-commit \
|
pre-commit \
|
||||||
pandas \
|
pandas \
|
||||||
@@ -209,11 +220,14 @@ RUN curl -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-lin
|
|||||||
&& rm -rf clangd_18.1.3 clangd.zip
|
&& rm -rf clangd_18.1.3 clangd.zip
|
||||||
|
|
||||||
# Install CMake
|
# Install CMake
|
||||||
RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.tar.gz \
|
RUN CMAKE_VERSION=3.31.1 \
|
||||||
&& tar -xzf cmake-3.31.1-linux-x86_64.tar.gz \
|
&& ARCH=$(uname -m) \
|
||||||
&& cp -r cmake-3.31.1-linux-x86_64/bin/* /usr/local/bin/ \
|
&& CMAKE_INSTALLER="cmake-${CMAKE_VERSION}-linux-${ARCH}" \
|
||||||
&& cp -r cmake-3.31.1-linux-x86_64/share/* /usr/local/share/ \
|
&& wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
|
||||||
&& rm -rf cmake-3.31.1-linux-x86_64 cmake-3.31.1-linux-x86_64.tar.gz
|
&& tar -xzf "${CMAKE_INSTALLER}.tar.gz" \
|
||||||
|
&& cp -r "${CMAKE_INSTALLER}/bin/"* /usr/local/bin/ \
|
||||||
|
&& cp -r "${CMAKE_INSTALLER}/share/"* /usr/local/share/ \
|
||||||
|
&& rm -rf "${CMAKE_INSTALLER}" "${CMAKE_INSTALLER}.tar.gz"
|
||||||
|
|
||||||
# Install Rust toolchain for sgl-router
|
# Install Rust toolchain for sgl-router
|
||||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||||
|
|||||||
Reference in New Issue
Block a user