diff --git a/.github/workflows/release-docker-gb200.yml b/.github/workflows/release-docker-gb200.yml index 9f7c6cf73..fbcacb330 100644 --- a/.github/workflows/release-docker-gb200.yml +++ b/.github/workflows/release-docker-gb200.yml @@ -11,7 +11,7 @@ jobs: publish: if: github.repository == 'sgl-project/sglang' runs-on: ubuntu-22.04-arm - environment: 'prod' + environment: "prod" steps: - name: Delete huge unnecessary tools folder run: rm -rf /opt/hostedtoolcache @@ -31,6 +31,6 @@ jobs: - name: Build and Push run: | version=$(cat python/sglang/version.py | cut -d'"' -f2) - tag=v${version}-cu128-gb200 + tag=v${version}-cu129-gb200 - docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.8.1 --build-arg BUILD_TYPE=blackwell --no-cache . + docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.9.1 --build-arg BUILD_TYPE=blackwell --no-cache . diff --git a/.github/workflows/release-whl-kernel-aarch64.yml b/.github/workflows/release-whl-kernel-aarch64.yml index 8e94fe8fb..5c0c09794 100644 --- a/.github/workflows/release-whl-kernel-aarch64.yml +++ b/.github/workflows/release-whl-kernel-aarch64.yml @@ -17,17 +17,17 @@ concurrency: cancel-in-progress: true jobs: - build-cu128-aarch64: + build-cu129-aarch64: if: github.repository == 'sgl-project/sglang' - runs-on: sgl-kernel-release-node + runs-on: sgl-kernel-release-node-arm strategy: matrix: - python-version: ['3.9'] - cuda-version: ['12.8'] + python-version: ["3.10"] + cuda-version: ["12.9"] steps: - uses: actions/checkout@v4 with: - submodules: 'recursive' + submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -47,7 +47,7 @@ jobs: path: sgl-kernel/dist/* release: - needs: build-cu128-aarch64 + needs: build-cu129-aarch64 runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -84,7 +84,7 @@ jobs: WHL_TOKEN: ${{ secrets.WHL_TOKEN }} - name: Update wheel index - run: python3 scripts/update_kernel_whl_index.py --cuda 128 + run: python3 scripts/update_kernel_whl_index.py --cuda 129 - name: Push wheel index run: | diff --git a/docker/Dockerfile b/docker/Dockerfile index fbfb7f5c1..67c5e4007 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -79,14 +79,17 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \ fi -# Build and install NVSHMEM + DeepEP -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && git clone https://github.com/deepseek-ai/DeepEP.git \ - && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \ - && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ - && cd nvshmem \ - && rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ - && NVSHMEM_SHMEM_SUPPORT=0 \ +# Download source files +RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \ + git clone https://github.com/deepseek-ai/DeepEP.git && \ + cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \ + tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \ + mv nvshmem_src nvshmem && \ + rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz + +# Build and install NVSHMEM +RUN cd /sgl-workspace/nvshmem && \ + NVSHMEM_SHMEM_SUPPORT=0 \ NVSHMEM_UCX_SUPPORT=0 \ NVSHMEM_USE_NCCL=0 \ NVSHMEM_MPI_SUPPORT=0 \ @@ -94,10 +97,12 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour NVSHMEM_PMIX_SUPPORT=0 \ NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES=90 \ - && cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \ - && cd /sgl-workspace/DeepEP \ - && NVSHMEM_DIR=${NVSHMEM_DIR} pip install . + cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" && \ + cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} + +# Install DeepEP +RUN cd /sgl-workspace/DeepEP && \ + NVSHMEM_DIR=${NVSHMEM_DIR} pip install . # Python tools RUN python3 -m pip install --no-cache-dir \ @@ -110,7 +115,8 @@ RUN python3 -m pip install --no-cache-dir \ icdiff \ uv \ wheel \ - scikit-build-core + scikit-build-core \ + nixl # Install development tools and utilities RUN apt-get update && apt-get install -y \ diff --git a/docker/Dockerfile.gb200 b/docker/Dockerfile.gb200 index 0d818692d..ba56b56d5 100644 --- a/docker/Dockerfile.gb200 +++ b/docker/Dockerfile.gb200 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION=12.8.1 +ARG CUDA_VERSION=12.9.1 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 ARG BUILD_TYPE=blackwell @@ -38,7 +38,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean -# --- Install SGLang missing package for blackwell build type +# Install SGLang missing package for blackwell build type RUN python3 -m pip install openai httpx # GDRCopy installation @@ -60,33 +60,39 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li && case "$CUDA_VERSION" in \ 12.6.1) CUINDEX=126 ;; \ 12.8.1) CUINDEX=128 ;; \ + 12.9.1) CUINDEX=129 ;; \ *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \ esac \ && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \ - && if [ "$CUDA_VERSION" = "12.8.1" ]; then \ + && if [ "$CUDA_VERSION" = "12.9.1" ]; then \ python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps ; \ - python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.7/sgl_kernel-0.2.7+cu128-cp39-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \ + python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3+cu129-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \ fi - # Build and install NVSHMEM + DeepEP -RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ -&& git clone https://github.com/fzyzcjy/DeepEP.git \ -&& cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. \ -&& tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \ -&& cd nvshmem \ -&& rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \ -&& NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" \ -&& cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} \ -&& cd /sgl-workspace/DeepEP \ -&& NVSHMEM_DIR=${NVSHMEM_DIR} pip install . +# Download source files +RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \ + git clone https://github.com/fzyzcjy/DeepEP.git && \ + cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \ + tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \ + mv nvshmem_src nvshmem && \ + rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz + +# Build and install NVSHMEM +RUN cd /sgl-workspace/nvshmem && \ + NVSHMEM_SHMEM_SUPPORT=0 \ + NVSHMEM_UCX_SUPPORT=0 \ + NVSHMEM_USE_NCCL=0 \ + NVSHMEM_MPI_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT=1 \ + NVSHMEM_PMIX_SUPPORT=0 \ + NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + NVSHMEM_USE_GDRCOPY=1 \ + cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" && \ + cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL} + +# Install DeepEP +RUN cd /sgl-workspace/DeepEP && \ + NVSHMEM_DIR=${NVSHMEM_DIR} pip install . # Python tools RUN python3 -m pip install --no-cache-dir \ @@ -106,7 +112,7 @@ RUN python3 -m pip install --no-cache-dir \ nvidia-cudnn-cu12 \ nvidia-cudnn-frontend -# Allows for FP4 disaggregation +# Install nixl kv transfer backend RUN python3 -m pip install --no-cache-dir \ nixl @@ -163,6 +169,12 @@ RUN python3 -m pip install --no-cache-dir --break-system-packages \ matplotlib \ tabulate +# Install flashinfer from source to fix a bug +# https://github.com/flashinfer-ai/flashinfer/pull/1413 +# FIXME: remove this once flashinfer release > 0.2.10 +WORKDIR /sgl-workspace +RUN git clone https://github.com/flashinfer-ai/flashinfer.git --recursive && cd flashinfer && python3 -m pip install -v . + # Install diff-so-fancy RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \ && chmod +x /usr/local/bin/diff-so-fancy diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index c09a128b5..c9d83303c 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -259,7 +259,7 @@ class Engine(EngineBase): f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]" ) - logger.info(f"data_parallel_rank: {data_parallel_rank}") + logger.debug(f"data_parallel_rank: {data_parallel_rank}") obj = GenerateReqInput( text=prompt, input_ids=input_ids, diff --git a/sgl-kernel/build.sh b/sgl-kernel/build.sh index 8ae22294a..e812c0c7b 100755 --- a/sgl-kernel/build.sh +++ b/sgl-kernel/build.sh @@ -39,6 +39,13 @@ docker run --rm \ # Install CMake (version >= 3.26) - Robust Installation export CMAKE_VERSION_MAJOR=3.31 export CMAKE_VERSION_MINOR=1 + # Setting these flags to reduce OOM chance only on ARM + if [ \"${ARCH}\" = \"aarch64\" ]; then + export CUDA_NVCC_FLAGS=\"-Xcudafe --threads=2\" + export MAKEFLAGS='-j2' + export CMAKE_BUILD_PARALLEL_LEVEL=2 + export NINJAFLAGS='-j2' + fi echo \"Downloading CMake from: https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz\" wget https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz tar -xzf cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz diff --git a/sgl-kernel/rename_wheels.sh b/sgl-kernel/rename_wheels.sh index b52a94a92..cab79e44e 100755 --- a/sgl-kernel/rename_wheels.sh +++ b/sgl-kernel/rename_wheels.sh @@ -7,8 +7,19 @@ wheel_files=($WHEEL_DIR/*.whl) for wheel in "${wheel_files[@]}"; do intermediate_wheel="${wheel/linux/manylinux2014}" - if ls /usr/local/ | grep -q "12.8"; then - new_wheel="${intermediate_wheel/-cp39/+cu128-cp39}" + # Extract the current python version from the wheel name + if [[ $intermediate_wheel =~ -cp([0-9]+)- ]]; then + cp_version="${BASH_REMATCH[1]}" + else + echo "Could not extract Python version from wheel name: $intermediate_wheel" + continue + fi + + # Detect CUDA version and add appropriate suffix + if ls /usr/local/ | grep -q "12.9"; then + new_wheel="${intermediate_wheel/-cp${cp_version}/+cu129-cp${cp_version}}" + elif ls /usr/local/ | grep -q "12.8"; then + new_wheel="${intermediate_wheel/-cp${cp_version}/+cu128-cp${cp_version}}" else new_wheel="$intermediate_wheel" fi