chore(gb200): update to CUDA 12.9 and improve build process (#8772)
This commit is contained in:
6
.github/workflows/release-docker-gb200.yml
vendored
6
.github/workflows/release-docker-gb200.yml
vendored
@@ -11,7 +11,7 @@ jobs:
|
|||||||
publish:
|
publish:
|
||||||
if: github.repository == 'sgl-project/sglang'
|
if: github.repository == 'sgl-project/sglang'
|
||||||
runs-on: ubuntu-22.04-arm
|
runs-on: ubuntu-22.04-arm
|
||||||
environment: 'prod'
|
environment: "prod"
|
||||||
steps:
|
steps:
|
||||||
- name: Delete huge unnecessary tools folder
|
- name: Delete huge unnecessary tools folder
|
||||||
run: rm -rf /opt/hostedtoolcache
|
run: rm -rf /opt/hostedtoolcache
|
||||||
@@ -31,6 +31,6 @@ jobs:
|
|||||||
- name: Build and Push
|
- name: Build and Push
|
||||||
run: |
|
run: |
|
||||||
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
version=$(cat python/sglang/version.py | cut -d'"' -f2)
|
||||||
tag=v${version}-cu128-gb200
|
tag=v${version}-cu129-gb200
|
||||||
|
|
||||||
docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.8.1 --build-arg BUILD_TYPE=blackwell --no-cache .
|
docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.9.1 --build-arg BUILD_TYPE=blackwell --no-cache .
|
||||||
|
|||||||
14
.github/workflows/release-whl-kernel-aarch64.yml
vendored
14
.github/workflows/release-whl-kernel-aarch64.yml
vendored
@@ -17,17 +17,17 @@ concurrency:
|
|||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-cu128-aarch64:
|
build-cu129-aarch64:
|
||||||
if: github.repository == 'sgl-project/sglang'
|
if: github.repository == 'sgl-project/sglang'
|
||||||
runs-on: sgl-kernel-release-node
|
runs-on: sgl-kernel-release-node-arm
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ['3.9']
|
python-version: ["3.10"]
|
||||||
cuda-version: ['12.8']
|
cuda-version: ["12.9"]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: 'recursive'
|
submodules: "recursive"
|
||||||
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
@@ -47,7 +47,7 @@ jobs:
|
|||||||
path: sgl-kernel/dist/*
|
path: sgl-kernel/dist/*
|
||||||
|
|
||||||
release:
|
release:
|
||||||
needs: build-cu128-aarch64
|
needs: build-cu129-aarch64
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
@@ -84,7 +84,7 @@ jobs:
|
|||||||
WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
|
WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
|
||||||
|
|
||||||
- name: Update wheel index
|
- name: Update wheel index
|
||||||
run: python3 scripts/update_kernel_whl_index.py --cuda 128
|
run: python3 scripts/update_kernel_whl_index.py --cuda 129
|
||||||
|
|
||||||
- name: Push wheel index
|
- name: Push wheel index
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -79,14 +79,17 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
|
|||||||
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
|
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Build and install NVSHMEM + DeepEP
|
# Download source files
|
||||||
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
|
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
|
||||||
&& git clone https://github.com/deepseek-ai/DeepEP.git \
|
git clone https://github.com/deepseek-ai/DeepEP.git && \
|
||||||
&& cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \
|
cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \
|
||||||
&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
|
tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
|
||||||
&& cd nvshmem \
|
mv nvshmem_src nvshmem && \
|
||||||
&& rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
|
rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
|
||||||
&& NVSHMEM_SHMEM_SUPPORT=0 \
|
|
||||||
|
# Build and install NVSHMEM
|
||||||
|
RUN cd /sgl-workspace/nvshmem && \
|
||||||
|
NVSHMEM_SHMEM_SUPPORT=0 \
|
||||||
NVSHMEM_UCX_SUPPORT=0 \
|
NVSHMEM_UCX_SUPPORT=0 \
|
||||||
NVSHMEM_USE_NCCL=0 \
|
NVSHMEM_USE_NCCL=0 \
|
||||||
NVSHMEM_MPI_SUPPORT=0 \
|
NVSHMEM_MPI_SUPPORT=0 \
|
||||||
@@ -94,10 +97,12 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour
|
|||||||
NVSHMEM_PMIX_SUPPORT=0 \
|
NVSHMEM_PMIX_SUPPORT=0 \
|
||||||
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
|
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
|
||||||
NVSHMEM_USE_GDRCOPY=1 \
|
NVSHMEM_USE_GDRCOPY=1 \
|
||||||
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=90 \
|
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" && \
|
||||||
&& cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \
|
cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
|
||||||
&& cd /sgl-workspace/DeepEP \
|
|
||||||
&& NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
|
# Install DeepEP
|
||||||
|
RUN cd /sgl-workspace/DeepEP && \
|
||||||
|
NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
|
||||||
|
|
||||||
# Python tools
|
# Python tools
|
||||||
RUN python3 -m pip install --no-cache-dir \
|
RUN python3 -m pip install --no-cache-dir \
|
||||||
@@ -110,7 +115,8 @@ RUN python3 -m pip install --no-cache-dir \
|
|||||||
icdiff \
|
icdiff \
|
||||||
uv \
|
uv \
|
||||||
wheel \
|
wheel \
|
||||||
scikit-build-core
|
scikit-build-core \
|
||||||
|
nixl
|
||||||
|
|
||||||
# Install development tools and utilities
|
# Install development tools and utilities
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
ARG CUDA_VERSION=12.8.1
|
ARG CUDA_VERSION=12.9.1
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
|
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
|
||||||
|
|
||||||
ARG BUILD_TYPE=blackwell
|
ARG BUILD_TYPE=blackwell
|
||||||
@@ -38,7 +38,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
|||||||
&& rm -rf /var/lib/apt/lists/* \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
&& apt-get clean
|
&& apt-get clean
|
||||||
|
|
||||||
# --- Install SGLang missing package for blackwell build type
|
# Install SGLang missing package for blackwell build type
|
||||||
RUN python3 -m pip install openai httpx
|
RUN python3 -m pip install openai httpx
|
||||||
|
|
||||||
# GDRCopy installation
|
# GDRCopy installation
|
||||||
@@ -60,33 +60,39 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
|
|||||||
&& case "$CUDA_VERSION" in \
|
&& case "$CUDA_VERSION" in \
|
||||||
12.6.1) CUINDEX=126 ;; \
|
12.6.1) CUINDEX=126 ;; \
|
||||||
12.8.1) CUINDEX=128 ;; \
|
12.8.1) CUINDEX=128 ;; \
|
||||||
|
12.9.1) CUINDEX=129 ;; \
|
||||||
*) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
|
*) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
|
||||||
esac \
|
esac \
|
||||||
&& python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
|
&& python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
|
||||||
&& if [ "$CUDA_VERSION" = "12.8.1" ]; then \
|
&& if [ "$CUDA_VERSION" = "12.9.1" ]; then \
|
||||||
python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps ; \
|
python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps ; \
|
||||||
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.7/sgl_kernel-0.2.7+cu128-cp39-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
|
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3+cu129-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Build and install NVSHMEM + DeepEP
|
# Download source files
|
||||||
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
|
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
|
||||||
&& git clone https://github.com/fzyzcjy/DeepEP.git \
|
git clone https://github.com/fzyzcjy/DeepEP.git && \
|
||||||
&& cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \
|
cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \
|
||||||
&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
|
tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
|
||||||
&& cd nvshmem \
|
mv nvshmem_src nvshmem && \
|
||||||
&& rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
|
rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
|
||||||
&& NVSHMEM_SHMEM_SUPPORT=0 \
|
|
||||||
NVSHMEM_UCX_SUPPORT=0 \
|
# Build and install NVSHMEM
|
||||||
NVSHMEM_USE_NCCL=0 \
|
RUN cd /sgl-workspace/nvshmem && \
|
||||||
NVSHMEM_MPI_SUPPORT=0 \
|
NVSHMEM_SHMEM_SUPPORT=0 \
|
||||||
NVSHMEM_IBGDA_SUPPORT=1 \
|
NVSHMEM_UCX_SUPPORT=0 \
|
||||||
NVSHMEM_PMIX_SUPPORT=0 \
|
NVSHMEM_USE_NCCL=0 \
|
||||||
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
|
NVSHMEM_MPI_SUPPORT=0 \
|
||||||
NVSHMEM_USE_GDRCOPY=1 \
|
NVSHMEM_IBGDA_SUPPORT=1 \
|
||||||
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" \
|
NVSHMEM_PMIX_SUPPORT=0 \
|
||||||
&& cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \
|
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
|
||||||
&& cd /sgl-workspace/DeepEP \
|
NVSHMEM_USE_GDRCOPY=1 \
|
||||||
&& NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
|
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" && \
|
||||||
|
cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
|
||||||
|
|
||||||
|
# Install DeepEP
|
||||||
|
RUN cd /sgl-workspace/DeepEP && \
|
||||||
|
NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
|
||||||
|
|
||||||
# Python tools
|
# Python tools
|
||||||
RUN python3 -m pip install --no-cache-dir \
|
RUN python3 -m pip install --no-cache-dir \
|
||||||
@@ -106,7 +112,7 @@ RUN python3 -m pip install --no-cache-dir \
|
|||||||
nvidia-cudnn-cu12 \
|
nvidia-cudnn-cu12 \
|
||||||
nvidia-cudnn-frontend
|
nvidia-cudnn-frontend
|
||||||
|
|
||||||
# Allows for FP4 disaggregation
|
# Install nixl kv transfer backend
|
||||||
RUN python3 -m pip install --no-cache-dir \
|
RUN python3 -m pip install --no-cache-dir \
|
||||||
nixl
|
nixl
|
||||||
|
|
||||||
@@ -163,6 +169,12 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \
|
|||||||
matplotlib \
|
matplotlib \
|
||||||
tabulate
|
tabulate
|
||||||
|
|
||||||
|
# Install flashinfer from source to fix a bug
|
||||||
|
# https://github.com/flashinfer-ai/flashinfer/pull/1413
|
||||||
|
# FIXME: remove this once flashinfer release > 0.2.10
|
||||||
|
WORKDIR /sgl-workspace
|
||||||
|
RUN git clone https://github.com/flashinfer-ai/flashinfer.git --recursive && cd flashinfer && python3 -m pip install -v .
|
||||||
|
|
||||||
# Install diff-so-fancy
|
# Install diff-so-fancy
|
||||||
RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
|
RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
|
||||||
&& chmod +x /usr/local/bin/diff-so-fancy
|
&& chmod +x /usr/local/bin/diff-so-fancy
|
||||||
|
|||||||
@@ -259,7 +259,7 @@ class Engine(EngineBase):
|
|||||||
f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]"
|
f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]"
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"data_parallel_rank: {data_parallel_rank}")
|
logger.debug(f"data_parallel_rank: {data_parallel_rank}")
|
||||||
obj = GenerateReqInput(
|
obj = GenerateReqInput(
|
||||||
text=prompt,
|
text=prompt,
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
|
|||||||
@@ -39,6 +39,13 @@ docker run --rm \
|
|||||||
# Install CMake (version >= 3.26) - Robust Installation
|
# Install CMake (version >= 3.26) - Robust Installation
|
||||||
export CMAKE_VERSION_MAJOR=3.31
|
export CMAKE_VERSION_MAJOR=3.31
|
||||||
export CMAKE_VERSION_MINOR=1
|
export CMAKE_VERSION_MINOR=1
|
||||||
|
# Setting these flags to reduce OOM chance only on ARM
|
||||||
|
if [ \"${ARCH}\" = \"aarch64\" ]; then
|
||||||
|
export CUDA_NVCC_FLAGS=\"-Xcudafe --threads=2\"
|
||||||
|
export MAKEFLAGS='-j2'
|
||||||
|
export CMAKE_BUILD_PARALLEL_LEVEL=2
|
||||||
|
export NINJAFLAGS='-j2'
|
||||||
|
fi
|
||||||
echo \"Downloading CMake from: https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz\"
|
echo \"Downloading CMake from: https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz\"
|
||||||
wget https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz
|
wget https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz
|
||||||
tar -xzf cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz
|
tar -xzf cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz
|
||||||
|
|||||||
@@ -7,8 +7,19 @@ wheel_files=($WHEEL_DIR/*.whl)
|
|||||||
for wheel in "${wheel_files[@]}"; do
|
for wheel in "${wheel_files[@]}"; do
|
||||||
intermediate_wheel="${wheel/linux/manylinux2014}"
|
intermediate_wheel="${wheel/linux/manylinux2014}"
|
||||||
|
|
||||||
if ls /usr/local/ | grep -q "12.8"; then
|
# Extract the current python version from the wheel name
|
||||||
new_wheel="${intermediate_wheel/-cp39/+cu128-cp39}"
|
if [[ $intermediate_wheel =~ -cp([0-9]+)- ]]; then
|
||||||
|
cp_version="${BASH_REMATCH[1]}"
|
||||||
|
else
|
||||||
|
echo "Could not extract Python version from wheel name: $intermediate_wheel"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Detect CUDA version and add appropriate suffix
|
||||||
|
if ls /usr/local/ | grep -q "12.9"; then
|
||||||
|
new_wheel="${intermediate_wheel/-cp${cp_version}/+cu129-cp${cp_version}}"
|
||||||
|
elif ls /usr/local/ | grep -q "12.8"; then
|
||||||
|
new_wheel="${intermediate_wheel/-cp${cp_version}/+cu128-cp${cp_version}}"
|
||||||
else
|
else
|
||||||
new_wheel="$intermediate_wheel"
|
new_wheel="$intermediate_wheel"
|
||||||
fi
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user