feat: support pip install sglang (#10465)

2025-09-15 03:09:17 -07:00
parent 059c13de5c
commit 5afd036533
8 changed files with 269 additions and 133 deletions
--- a/.github/workflows/pr-test-xeon.yml
+++ b/.github/workflows/pr-test-xeon.yml
@@ -58,6 +58,7 @@ jobs:
          docker exec ci_sglang_xeon bash -c "python3 -m pip install --upgrade pip"
          docker exec ci_sglang_xeon pip uninstall sgl-kernel -y || true
          docker exec -w /sglang-checkout/sgl-kernel ci_sglang_xeon bash -c "cp pyproject_cpu.toml pyproject.toml && pip install -v ."
          docker exec -w /sglang-checkout/ ci_sglang_xeon bash -c "rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml"
          # Single-quote the requirement inside the double-quoted bash -c script so the
          # brackets reach pip verbatim instead of being exposed to shell globbing.
          docker exec -w /sglang-checkout/ ci_sglang_xeon bash -c "pip install -e 'python[dev_cpu]'"
      - name: Check AMX support
--- a/docker/Dockerfile.npu
+++ b/docker/Dockerfile.npu
@@ -77,7 +77,7 @@ RUN pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --inde
 # Install SGLang
 RUN git clone https://github.com/sgl-project/sglang --branch $SGLANG_TAG && \
-    (cd sglang/python && pip install -v .[srt_npu] --no-cache-dir) && \
+    (cd sglang/python && rm -rf pyproject.toml && mv pyproject_other.toml pyproject.toml && pip install -v .[srt_npu] --no-cache-dir) && \
    (cd sglang/sgl-router && python -m build && pip install --force-reinstall dist/*.whl) && \
    rm -rf sglang
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -181,6 +181,7 @@ RUN git clone ${SGL_REPO} \
    && mv pyproject_rocm.toml pyproject.toml \
    && AMDGPU_TARGET=$GPU_ARCH_LIST python setup_rocm.py install \
    && cd .. \
    && rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml \
    && if [ "$BUILD_TYPE" = "srt" ]; then \
         python -m pip --no-cache-dir install -e "python[srt_hip]" ${NO_DEPS_FLAG}; \
       else \
--- a/docker/Dockerfile.xeon
+++ b/docker/Dockerfile.xeon
@@ -35,6 +35,7 @@ RUN pip config set global.index-url https://download.pytorch.org/whl/cpu && \
 RUN git clone https://github.com/sgl-project/sglang.git && \
    cd sglang && \
    # Check out the requested revision first: swapping the tracked pyproject files
    # beforehand dirties the work tree, which can make the checkout abort and would
    # apply the swap to the wrong revision's files.
    git checkout ${VER_SGLANG} && \
    rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml && \
    pip install -e "python[all_cpu]" && \
    pip install torch==${VER_TORCH} torchvision==${VER_TORCHVISION} triton==${VER_TRITON} --force-reinstall && \
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -10,131 +10,87 @@ readme = "README.md"
 requires-python = ">=3.10"
 license = { file = "LICENSE" }
 classifiers = [
-    "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3",
-    "License :: OSI Approved :: Apache Software License",
+  "License :: OSI Approved :: Apache Software License",
 ]
 # Requirements installed by a plain `pip install sglang`; each requirement appears once.
 dependencies = [
  "aiohttp",
  "requests",
  "tqdm",
  "numpy",
  "IPython",
  "setproctitle",
  "blobfile==3.0.0",
  "build",
  "compressed-tensors",
  "datasets",
  "einops",
  "fastapi",
  "hf_transfer",
  "huggingface_hub",
  "interegular",
  "llguidance>=0.7.11,<0.8.0",
  "modelscope",
  "msgspec",
  "ninja",
  "openai==1.99.1",
  "openai-harmony==0.0.4",
  "orjson",
  "outlines==0.1.11",
  "packaging",
  "partial_json_parser",
  "pillow",
  "prometheus-client>=0.20.0",
  "psutil",
  "pybase64",
  "pydantic",
  "pynvml",
  "python-multipart",
  "pyzmq>=25.1.2",
  "scipy",
  "sentencepiece",
  "soundfile==0.13.1",
  "timm==1.0.16",
  "tiktoken",
  "torchao==0.9.0",
  "transformers==4.56.1",
  "uvicorn",
  "uvloop",
  "xgrammar==0.1.24",
  "sgl-kernel==0.3.9.post2",
  "torch==2.8.0",
  "torchaudio==2.8.0",
  "torchvision",
  "cuda-python",
  "flashinfer_python==0.3.1",
  "anthropic>=0.20.0",
  "torch_memory_saver==0.0.8",
  "decord",
 ]
 dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle"]
 [project.optional-dependencies]
-runtime_common = [
+test = [
-    "blobfile==3.0.0",
+  "accelerate",
-    "build",
+  "expecttest",
-    "compressed-tensors",
+  "jsonlines",
-    "datasets",
+  "matplotlib",
-    "einops",
+  "pandas",
-    "fastapi",
+  "peft",
-    "hf_transfer",
+  "sentence_transformers",
-    "huggingface_hub",
+  "pytest",
-    "interegular",
+  "tabulate",
    "llguidance>=0.7.11,<0.8.0",
    "modelscope",
    "msgspec",
    "ninja",
    "openai==1.99.1",
    "openai-harmony==0.0.4",
    "orjson",
    "outlines==0.1.11",
    "packaging",
    "partial_json_parser",
    "pillow",
    "prometheus-client>=0.20.0",
    "psutil",
    "pybase64",
    "pydantic",
    "pynvml",
    "python-multipart",
    "pyzmq>=25.1.2",
    "scipy",
    "sentencepiece",
    "soundfile==0.13.1",
    "timm==1.0.16",
    "tiktoken",
    "torchao==0.9.0",
    "transformers==4.56.1",
    "uvicorn",
    "uvloop",
    "xgrammar==0.1.24",
 ]
 tracing = [
    "opentelemetry-sdk",
    "opentelemetry-api",
    "opentelemetry-exporter-otlp",
    "opentelemetry-exporter-otlp-proto-grpc",
 ]
-
+all = ["sglang[test]"]
-srt = [
+blackwell = ["nvidia-cutlass-dsl==4.1.0", "sglang[test]"]
-    "sglang[runtime_common]",
+dev = ["sglang[test]"]
    "sgl-kernel==0.3.9.post2",
    "torch==2.8.0",
    "torchaudio==2.8.0",
    "torchvision",
    "cuda-python",
    "flashinfer_python==0.3.1",
 ]
 blackwell = [
    "sglang[runtime_common]",
    "sgl-kernel==0.3.9.post2",
    "torch==2.8.0",
    "torchaudio==2.8.0",
    "torchvision",
    "cuda-python",
    "flashinfer_python==0.3.1",
    "nvidia-cutlass-dsl==4.1.0",
 ]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20250114, not from public vllm whl
 srt_hip = [
    "sglang[runtime_common]",
    "torch",
    "petit_kernel==0.0.2",
    "wave-lang==3.7.0",
 ]
 # https://docs.sglang.ai/platforms/cpu_server.html
 srt_cpu = ["sglang[runtime_common]", "intel-openmp"]
 # https://docs.sglang.ai/platforms/ascend_npu.html
 srt_npu = ["sglang[runtime_common]"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
 srt_xpu = ["sglang[runtime_common]"]
 # For Intel Gaudi(device : hpu) follow the installation guide
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
 srt_hpu = ["sglang[runtime_common]"]
 openai = ["openai==1.99.1", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
 torch_memory_saver = ["torch_memory_saver==0.0.8"]
 decord = ["decord"]
 test = [
    "accelerate",
    "expecttest",
    "jsonlines",
    "matplotlib",
    "pandas",
    "peft",
    "sentence_transformers",
    "pytest",
    "tabulate",
 ]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[torch_memory_saver]", "sglang[decord]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 all_npu = ["sglang[srt_npu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
 dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
@@ -142,31 +98,31 @@ dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
 [tool.setuptools.package-data]
 "sglang" = [
-    "srt/layers/moe/fused_moe_triton/configs/*/*.json",
+  "srt/layers/moe/fused_moe_triton/configs/*/*.json",
-    "srt/layers/quantization/configs/*.json",
+  "srt/layers/quantization/configs/*.json",
-    "srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp",
+  "srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp",
 ]
 [tool.setuptools.packages.find]
 exclude = [
-    "assets*",
+  "assets*",
-    "benchmark*",
+  "benchmark*",
-    "docs*",
+  "docs*",
-    "dist*",
+  "dist*",
-    "playground*",
+  "playground*",
-    "scripts*",
+  "scripts*",
-    "tests*",
+  "tests*",
 ]
 [tool.wheel]
 exclude = [
-    "assets*",
+  "assets*",
-    "benchmark*",
+  "benchmark*",
-    "docs*",
+  "docs*",
-    "dist*",
+  "dist*",
-    "playground*",
+  "playground*",
-    "scripts*",
+  "scripts*",
-    "tests*",
+  "tests*",
 ]
 [tool.codespell]
--- a/python/pyproject_other.toml
+++ b/python/pyproject_other.toml
@@ -0,0 +1,174 @@
 # PEP 517 build configuration: the package is built with the setuptools backend.
 [build-system]
 build-backend = "setuptools.build_meta"
 requires = [
    "setuptools>=61.0",
    "wheel",
 ]
 # Core package metadata and the minimal requirement set shared by every install flavor.
 [project]
 name = "sglang"
 version = "0.5.2"
 description = "SGLang is a fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.10"
 license = { file = "LICENSE" }
 classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
 ]
 dependencies = [
    "aiohttp",
    "requests",
    "tqdm",
    "numpy",
    "IPython",
    "setproctitle",
 ]
 [project.optional-dependencies]
 runtime_common = [
    "blobfile==3.0.0",
    "build",
    "compressed-tensors",
    "datasets",
    "einops",
    "fastapi",
    "hf_transfer",
    "huggingface_hub",
    "interegular",
    "llguidance>=0.7.11,<0.8.0",
    "modelscope",
    "msgspec",
    "ninja",
    "openai==1.99.1",
    "openai-harmony==0.0.4",
    "orjson",
    "outlines==0.1.11",
    "packaging",
    "partial_json_parser",
    "pillow",
    "prometheus-client>=0.20.0",
    "psutil",
    "pybase64",
    "pydantic",
    "pynvml",
    "python-multipart",
    "pyzmq>=25.1.2",
    "scipy",
    "sentencepiece",
    "soundfile==0.13.1",
    "timm==1.0.16",
    "tiktoken",
    "torchao==0.9.0",
    "transformers==4.56.1",
    "uvicorn",
    "uvloop",
    "xgrammar==0.1.24",
 ]
 tracing = [
    "opentelemetry-sdk",
    "opentelemetry-api",
    "opentelemetry-exporter-otlp",
    "opentelemetry-exporter-otlp-proto-grpc",
 ]
 srt = [
    "sglang[runtime_common]",
    "sgl-kernel==0.3.9.post2",
    "torch==2.8.0",
    "torchaudio==2.8.0",
    "torchvision",
    "cuda-python",
    "flashinfer_python==0.3.1",
 ]
 # Same requirement set as the `srt` extra plus nvidia-cutlass-dsl; referencing
 # `sglang[srt]` keeps the shared version pins in a single place.
 blackwell = [
    "sglang[srt]",
    "nvidia-cutlass-dsl==4.1.0",
 ]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20250114, not from public vllm whl
 srt_hip = [
    "sglang[runtime_common]",
    "torch",
    "petit_kernel==0.0.2",
    "wave-lang==3.7.0",
 ]
 # https://docs.sglang.ai/platforms/cpu_server.html
 srt_cpu = ["sglang[runtime_common]", "intel-openmp"]
 # https://docs.sglang.ai/platforms/ascend_npu.html
 srt_npu = ["sglang[runtime_common]"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
 srt_xpu = ["sglang[runtime_common]"]
 # For Intel Gaudi(device : hpu) follow the installation guide
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
 srt_hpu = ["sglang[runtime_common]"]
 openai = ["openai==1.99.1", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
 torch_memory_saver = ["torch_memory_saver==0.0.8"]
 decord = ["decord"]
 test = [
    "accelerate",
    "expecttest",
    "jsonlines",
    "matplotlib",
    "pandas",
    "peft",
    "sentence_transformers",
    "pytest",
    "tabulate",
 ]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[torch_memory_saver]", "sglang[decord]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 all_npu = ["sglang[srt_npu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
 dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
 # Project links published with the package metadata.
 [project.urls]
 Homepage = "https://github.com/sgl-project/sglang"
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
 # Non-Python files bundled with the `sglang` package: JSON config files under two
 # `configs` directories and one C++ source file. Patterns are relative to the package.
 [tool.setuptools.package-data]
 "sglang" = [
    "srt/layers/moe/fused_moe_triton/configs/*/*.json",
    "srt/layers/quantization/configs/*.json",
    "srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp",
 ]
 # Package discovery: names matching these glob patterns are left out when
 # setuptools searches for packages to include.
 [tool.setuptools.packages.find]
 exclude = [
    "assets*",
    "benchmark*",
    "docs*",
    "dist*",
    "playground*",
    "scripts*",
    "tests*",
 ]
 # Same exclusion patterns as [tool.setuptools.packages.find].
 # NOTE(review): it is not evident from this file which tool reads [tool.wheel] —
 # confirm the table is actually consumed.
 [tool.wheel]
 exclude = [
    "assets*",
    "benchmark*",
    "docs*",
    "dist*",
    "playground*",
    "scripts*",
    "tests*",
 ]
 # codespell settings: words the spell checker should not flag, and file globs it skips.
 [tool.codespell]
 ignore-words-list = "ans, als, hel, boostrap, childs, te, vas, hsa, ment"
 skip = "*.json,*.jsonl,*.patch,*.txt"
--- a/scripts/ci/amd_ci_install_dependency.sh
+++ b/scripts/ci/amd_ci_install_dependency.sh
@@ -19,6 +19,7 @@ docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.to
 case "${GPU_ARCH}" in
  mi35x)
    echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
    # Run the whole swap inside the container: without bash -c the host shell splits
    # the line at && and executes mv on the runner instead of in ci_sglang.
    docker exec ci_sglang bash -c "rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml"
    docker exec ci_sglang pip install -e "python[dev_hip]" --no-deps # TODO: only for mi35x
    # For lmms_evals evaluating MMMU
    docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
@@ -26,6 +27,7 @@ case "${GPU_ARCH}" in
    ;;
  mi30x|mi300|mi325)
    echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
    # Run the whole swap inside the container: without bash -c the host shell splits
    # the line at && and executes mv on the runner instead of in ci_sglang.
    docker exec ci_sglang bash -c "rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml"
    docker exec ci_sglang pip install -e "python[dev_hip]"
    # For lmms_evals evaluating MMMU
    docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
--- a/scripts/ci/npu_ci_install_dependency.sh
+++ b/scripts/ci/npu_ci_install_dependency.sh
@@ -64,4 +64,5 @@ git clone --depth 1 https://github.com/sgl-project/sgl-kernel-npu.git --branch $
 ### Install SGLang
 rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml
 ${PIP_INSTALL} -v -e "python[srt_npu]"