feat: Add SageMaker support (#3740)

Andrew Smith
2025-02-21 22:31:09 +11:00
committed by GitHub
parent 0c227ee373
commit 1df6eabd5d
4 changed files with 299 additions and 0 deletions


@@ -0,0 +1,78 @@
ARG CUDA_VERSION=12.5.1
FROM nvcr.io/nvidia/tritonserver:24.04-py3-min
ARG BUILD_TYPE=all
ENV DEBIAN_FRONTEND=noninteractive
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt update -y \
    && apt install software-properties-common -y \
    && add-apt-repository ppa:deadsnakes/ppa -y && apt update \
    && apt install python3.10 python3.10-dev -y \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
    && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \
    && apt install curl git sudo libibverbs-dev -y \
    && apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \
    && python3 --version \
    && python3 -m pip --version \
    && rm -rf /var/lib/apt/lists/* \
    && apt clean
# For openbmb/MiniCPM models
RUN pip3 install datamodel_code_generator
WORKDIR /sgl-workspace
ARG CUDA_VERSION
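# Install a torch wheel matching the requested CUDA version, then install
# sglang from source, pulling flashinfer wheels built for that CUDA/torch combination.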
RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
    && git clone --depth=1 https://github.com/sgl-project/sglang.git \
    && if [ "$CUDA_VERSION" = "12.1.1" ]; then \
         python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu121; \
       elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
         python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124; \
       elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
         python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124; \
       elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
         python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118; \
         python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
       else \
         echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
       fi \
    && cd sglang \
    && if [ "$BUILD_TYPE" = "srt" ]; then \
         if [ "$CUDA_VERSION" = "12.1.1" ]; then \
           python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu121/torch2.5/flashinfer-python; \
         elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
           python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
         elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
           python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
         elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
           python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu118/torch2.5/flashinfer-python; \
           python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
         else \
           echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
         fi; \
       else \
         if [ "$CUDA_VERSION" = "12.1.1" ]; then \
           python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu121/torch2.5/flashinfer-python; \
         elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
           python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
         elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
           python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
         elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
           python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu118/torch2.5/flashinfer-python; \
           python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
         else \
           echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
         fi; \
       fi
ENV DEBIAN_FRONTEND=interactive
COPY serve /usr/bin/serve
RUN chmod 777 /usr/bin/serve
ENTRYPOINT [ "/usr/bin/serve" ]

docker/serve Executable file

@@ -0,0 +1,31 @@
#!/bin/bash
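# Entrypoint for SageMaker hosting: SageMaker starts the container with the
# argument "serve", extracts the model artifacts to /opt/ml/model, and expects
# the server to listen on port 8080.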
echo "Starting server"
SERVER_ARGS="--host 0.0.0.0 --port 8080"
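# Translate optional environment variables (settable in the SageMaker model or
# endpoint configuration) into sglang launch flags; unset variables are skipped.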
if [ -n "$TENSOR_PARALLEL_DEGREE" ]; then
SERVER_ARGS="${SERVER_ARGS} --tp-size ${TENSOR_PARALLEL_DEGREE}"
fi
if [ -n "$DATA_PARALLEL_DEGREE" ]; then
SERVER_ARGS="${SERVER_ARGS} --dp-size ${DATA_PARALLEL_DEGREE}"
fi
if [ -n "$EXPERT_PARALLEL_DEGREE" ]; then
SERVER_ARGS="${SERVER_ARGS} --ep-size ${EXPERT_PARALLEL_DEGREE}"
fi
if [ -n "$MEM_FRACTION_STATIC" ]; then
SERVER_ARGS="${SERVER_ARGS} --mem-fraction-static ${MEM_FRACTION_STATIC}"
fi
if [ -n "$QUANTIZATION" ]; then
SERVER_ARGS="${SERVER_ARGS} --quantization ${QUANTIZATION}"
fi
if [ -n "$CHUNKED_PREFILL_SIZE" ]; then
SERVER_ARGS="${SERVER_ARGS} --chunked-prefill-size ${CHUNKED_PREFILL_SIZE}"
fi
python3 -m sglang.launch_server --model-path /opt/ml/model $SERVER_ARGS
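
As a local smoke test, one might run the resulting container the way SageMaker hosting would, mounting model artifacts at /opt/ml/model and publishing port 8080. The image tag and model path below are illustrative, not taken from this commit.

# Illustrative tag and model path; any env var left unset simply omits its flag.
docker run --rm --gpus all \
    -p 8080:8080 \
    -v /path/to/model:/opt/ml/model \
    -e TENSOR_PARALLEL_DEGREE=2 \
    -e MEM_FRACTION_STATIC=0.8 \
    sglang-sagemaker:latest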